/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_TC_CT_H
#define __NET_TC_CT_H

#include <net/act_api.h>
#include <uapi/linux/tc_act/tc_ct.h>

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_conntrack_labels.h>

struct tcf_ct_params {
	struct nf_conntrack_helper *helper;
	struct nf_conn *tmpl;
	u16 zone;

	u32 mark;
	u32 mark_mask;

	u32 labels[NF_CT_LABELS_MAX_SIZE / sizeof(u32)];
	u32 labels_mask[NF_CT_LABELS_MAX_SIZE / sizeof(u32)];

	struct nf_nat_range2 range;
	bool ipv4_range;
	bool put_labels;

	u16 ct_action;

	struct rcu_head rcu;

	struct tcf_ct_flow_table *ct_ft;
	struct nf_flowtable *nf_ft;
};

struct tcf_ct {
	struct tc_action common;
	struct tcf_ct_params __rcu *params;
};

#define to_ct(a) ((struct tcf_ct *)a)
#define to_ct_params(a)						\
	((struct tcf_ct_params *)				\
	 rcu_dereference_protected(to_ct(a)->params,		\
				   lockdep_is_held(&a->tcfa_lock)))

static inline uint16_t tcf_ct_zone(const struct tc_action *a)
{
	return to_ct_params(a)->zone;
}

static inline int tcf_ct_action(const struct tc_action *a)
{
	return to_ct_params(a)->ct_action;
}

static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a)
{
	return to_ct_params(a)->nf_ft;
}

static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a)
{
	return to_ct_params(a)->helper;
}

#else
static inline uint16_t tcf_ct_zone(const struct tc_action *a) { return 0; }
static inline int tcf_ct_action(const struct tc_action *a) { return 0; }
static inline struct nf_flowtable *tcf_ct_ft(const struct tc_action *a)
{
	return NULL;
}
static inline struct nf_conntrack_helper *tcf_ct_helper(const struct tc_action *a)
{
	return NULL;
}
#endif /* CONFIG_NF_CONNTRACK */

#if IS_ENABLED(CONFIG_NET_ACT_CT)
static inline void
tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie)
{
	enum ip_conntrack_info ctinfo = cookie & NFCT_INFOMASK;
	struct nf_conn *ct;

	ct = (struct nf_conn *)(cookie & NFCT_PTRMASK);
	nf_conntrack_get(&ct->ct_general);
	nf_ct_set(skb, ct, ctinfo);
}
#else
static inline void
tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { }
#endif

static inline bool is_tcf_ct(const struct tc_action *a)
{
#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (a->ops && a->ops->id == TCA_ID_CT)
		return true;
#endif
	return false;
}

#endif /* __NET_TC_CT_H */
// SPDX-License-Identifier: GPL-2.0-only
/*
 *	fs/bfs/inode.c
 *	BFS superblock and inode operations.
 *	Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com>
 *	From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
 *	Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
 */

#include <linux/module.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
#include "bfs.h"

MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>");
MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux");
MODULE_LICENSE("GPL");

#undef DEBUG

#ifdef DEBUG
#define dprintf(x...)	printf(x)
#else
#define dprintf(x...)
#endif struct inode *bfs_iget(struct super_block *sb, unsigned long ino) { struct bfs_inode *di; struct inode *inode; struct buffer_head *bh; int block, off; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(inode->i_sb)->si_lasti)) { printf("Bad inode number %s:%08lx\n", inode->i_sb->s_id, ino); goto error; } block = (ino - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; bh = sb_bread(inode->i_sb, block); if (!bh) { printf("Unable to read inode %s:%08lx\n", inode->i_sb->s_id, ino); goto error; } off = (ino - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; di = (struct bfs_inode *)bh->b_data + off; inode->i_mode = 0x0000FFFF & le32_to_cpu(di->i_mode); if (le32_to_cpu(di->i_vtype) == BFS_VDIR) { inode->i_mode |= S_IFDIR; inode->i_op = &bfs_dir_inops; inode->i_fop = &bfs_dir_operations; } else if (le32_to_cpu(di->i_vtype) == BFS_VREG) { inode->i_mode |= S_IFREG; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; inode->i_mapping->a_ops = &bfs_aops; } BFS_I(inode)->i_sblock = le32_to_cpu(di->i_sblock); BFS_I(inode)->i_eblock = le32_to_cpu(di->i_eblock); BFS_I(inode)->i_dsk_ino = le16_to_cpu(di->i_ino); i_uid_write(inode, le32_to_cpu(di->i_uid)); i_gid_write(inode, le32_to_cpu(di->i_gid)); set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); inode_set_atime(inode, le32_to_cpu(di->i_atime), 0); inode_set_mtime(inode, le32_to_cpu(di->i_mtime), 0); inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0); brelse(bh); unlock_new_inode(inode); return inode; error: iget_failed(inode); return ERR_PTR(-EIO); } static struct bfs_inode *find_inode(struct super_block *sb, u16 ino, struct buffer_head **p) { if ((ino < BFS_ROOT_INO) || (ino > BFS_SB(sb)->si_lasti)) { printf("Bad inode number %s:%08x\n", sb->s_id, ino); return ERR_PTR(-EIO); } ino -= BFS_ROOT_INO; *p = sb_bread(sb, 1 + ino / BFS_INODES_PER_BLOCK); if (!*p) { printf("Unable to read inode %s:%08x\n", sb->s_id, ino); return ERR_PTR(-EIO); } return (struct bfs_inode *)(*p)->b_data + ino % BFS_INODES_PER_BLOCK; } static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct bfs_sb_info *info = BFS_SB(inode->i_sb); unsigned int ino = (u16)inode->i_ino; unsigned long i_sblock; struct bfs_inode *di; struct buffer_head *bh; int err = 0; dprintf("ino=%08x\n", ino); di = find_inode(inode->i_sb, ino, &bh); if (IS_ERR(di)) return PTR_ERR(di); mutex_lock(&info->bfs_lock); if (ino == BFS_ROOT_INO) di->i_vtype = cpu_to_le32(BFS_VDIR); else di->i_vtype = cpu_to_le32(BFS_VREG); di->i_ino = cpu_to_le16(ino); di->i_mode = cpu_to_le32(inode->i_mode); di->i_uid = cpu_to_le32(i_uid_read(inode)); di->i_gid = cpu_to_le32(i_gid_read(inode)); di->i_nlink = cpu_to_le32(inode->i_nlink); di->i_atime = cpu_to_le32(inode_get_atime_sec(inode)); di->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode)); di->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode)); i_sblock = BFS_I(inode)->i_sblock; di->i_sblock = cpu_to_le32(i_sblock); di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1); mark_buffer_dirty(bh); if (wbc->sync_mode == WB_SYNC_ALL) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) err = -EIO; } brelse(bh); mutex_unlock(&info->bfs_lock); return err; } static void bfs_evict_inode(struct inode *inode) { unsigned long ino = inode->i_ino; struct bfs_inode *di; struct buffer_head *bh; struct super_block 
*s = inode->i_sb; struct bfs_sb_info *info = BFS_SB(s); struct bfs_inode_info *bi = BFS_I(inode); dprintf("ino=%08lx\n", ino); truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); if (inode->i_nlink) return; di = find_inode(s, inode->i_ino, &bh); if (IS_ERR(di)) return; mutex_lock(&info->bfs_lock); /* clear on-disk inode */ memset(di, 0, sizeof(struct bfs_inode)); mark_buffer_dirty(bh); brelse(bh); if (bi->i_dsk_ino) { if (bi->i_sblock) info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; info->si_freei++; clear_bit(ino, info->si_imap); bfs_dump_imap("evict_inode", s); } /* * If this was the last file, make the previous block * "last block of the last file" even if there is no * real file there, saves us 1 gap. */ if (info->si_lf_eblk == bi->i_eblock) info->si_lf_eblk = bi->i_sblock - 1; mutex_unlock(&info->bfs_lock); } static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); if (!info) return; mutex_destroy(&info->bfs_lock); kfree(info); s->s_fs_info = NULL; } static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *s = dentry->d_sb; struct bfs_sb_info *info = BFS_SB(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); buf->f_type = BFS_MAGIC; buf->f_bsize = s->s_blocksize; buf->f_blocks = info->si_blocks; buf->f_bfree = buf->f_bavail = info->si_freeb; buf->f_files = info->si_lasti + 1 - BFS_ROOT_INO; buf->f_ffree = info->si_freei; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = BFS_NAMELEN; return 0; } static struct kmem_cache *bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) { struct bfs_inode_info *bi; bi = alloc_inode_sb(sb, bfs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; } static void bfs_free_inode(struct inode *inode) { kmem_cache_free(bfs_inode_cachep, BFS_I(inode)); } static void init_once(void *foo) { struct bfs_inode_info *bi = foo; inode_init_once(&bi->vfs_inode); } static int __init init_inodecache(void) { bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. 
*/ rcu_barrier(); kmem_cache_destroy(bfs_inode_cachep); } static const struct super_operations bfs_sops = { .alloc_inode = bfs_alloc_inode, .free_inode = bfs_free_inode, .write_inode = bfs_write_inode, .evict_inode = bfs_evict_inode, .put_super = bfs_put_super, .statfs = bfs_statfs, }; void bfs_dump_imap(const char *prefix, struct super_block *s) { #ifdef DEBUG int i; char *tmpbuf = (char *)get_zeroed_page(GFP_KERNEL); if (!tmpbuf) return; for (i = BFS_SB(s)->si_lasti; i >= 0; i--) { if (i > PAGE_SIZE - 100) break; if (test_bit(i, BFS_SB(s)->si_imap)) strcat(tmpbuf, "1"); else strcat(tmpbuf, "0"); } printf("%s: lasti=%08lx <%s>\n", prefix, BFS_SB(s)->si_lasti, tmpbuf); free_page((unsigned long)tmpbuf); #endif } static int bfs_fill_super(struct super_block *s, void *data, int silent) { struct buffer_head *bh, *sbh; struct bfs_super_block *bfs_sb; struct inode *inode; unsigned i; struct bfs_sb_info *info; int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; mutex_init(&info->bfs_lock); s->s_fs_info = info; s->s_time_min = 0; s->s_time_max = U32_MAX; sb_set_blocksize(s, BFS_BSIZE); sbh = sb_bread(s, 0); if (!sbh) goto out; bfs_sb = (struct bfs_super_block *)sbh->b_data; if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { if (!silent) printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic)); goto out1; } if (BFS_UNCLEAN(bfs_sb, s) && !silent) printf("%s is unclean, continuing\n", s->s_id); s->s_magic = BFS_MAGIC; if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end) || le32_to_cpu(bfs_sb->s_start) < sizeof(struct bfs_super_block) + sizeof(struct bfs_dirent)) { printf("Superblock is corrupted on %s\n", s->s_id); goto out1; } info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; if (info->si_lasti == BFS_MAX_LASTI) printf("NOTE: filesystem %s was created with 512 inodes, the real maximum is 511, mounting anyway\n", s->s_id); else if (info->si_lasti > BFS_MAX_LASTI) { printf("Impossible last inode number %lu > %d on %s\n", info->si_lasti, BFS_MAX_LASTI, s->s_id); goto out1; } for (i = 0; i < BFS_ROOT_INO; i++) set_bit(i, info->si_imap); s->s_op = &bfs_sops; inode = bfs_iget(s, BFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto out1; } s->s_root = d_make_root(inode); if (!s->s_root) { ret = -ENOMEM; goto out1; } info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; info->si_freeb = (le32_to_cpu(bfs_sb->s_end) + 1 - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; info->si_freei = 0; info->si_lf_eblk = 0; /* can we read the last block? 
*/ bh = sb_bread(s, info->si_blocks - 1); if (!bh) { printf("Last block not available on %s: %lu\n", s->s_id, info->si_blocks - 1); ret = -EIO; goto out2; } brelse(bh); bh = NULL; for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { struct bfs_inode *di; int block = (i - BFS_ROOT_INO) / BFS_INODES_PER_BLOCK + 1; int off = (i - BFS_ROOT_INO) % BFS_INODES_PER_BLOCK; unsigned long eblock; if (!off) { brelse(bh); bh = sb_bread(s, block); } if (!bh) continue; di = (struct bfs_inode *)bh->b_data + off; /* test if filesystem is not corrupted */ i_eoff = le32_to_cpu(di->i_eoffset); i_sblock = le32_to_cpu(di->i_sblock); i_eblock = le32_to_cpu(di->i_eblock); s_size = le32_to_cpu(bfs_sb->s_end); if (i_sblock > info->si_blocks || i_eblock > info->si_blocks || i_sblock > i_eblock || (i_eoff != le32_to_cpu(-1) && i_eoff > s_size) || i_sblock * BFS_BSIZE > i_eoff) { printf("Inode 0x%08x corrupted on %s\n", i, s->s_id); brelse(bh); ret = -EIO; goto out2; } if (!di->i_ino) { info->si_freei++; continue; } set_bit(i, info->si_imap); info->si_freeb -= BFS_FILEBLOCKS(di); eblock = le32_to_cpu(di->i_eblock); if (eblock > info->si_lf_eblk) info->si_lf_eblk = eblock; } brelse(bh); brelse(sbh); bfs_dump_imap("fill_super", s); return 0; out2: dput(s->s_root); s->s_root = NULL; out1: brelse(sbh); out: mutex_destroy(&info->bfs_lock); kfree(info); s->s_fs_info = NULL; return ret; } static struct dentry *bfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super); } static struct file_system_type bfs_fs_type = { .owner = THIS_MODULE, .name = "bfs", .mount = bfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("bfs"); static int __init init_bfs_fs(void) { int err = init_inodecache(); if (err) goto out1; err = register_filesystem(&bfs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_bfs_fs(void) { unregister_filesystem(&bfs_fs_type); destroy_inodecache(); } module_init(init_bfs_fs) module_exit(exit_bfs_fs) |
//
SPDX-License-Identifier: GPL-2.0-only /* * proc/fs/generic.c --- generic routines for the proc-fs * * This file contains generic proc-fs routines for handling * directories and files. * * Copyright (C) 1991, 1992 Linus Torvalds. * Copyright (C) 1997 Theodore Ts'o */ #include <linux/cache.h> #include <linux/errno.h> #include <linux/time.h> #include <linux/proc_fs.h> #include <linux/stat.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/printk.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/idr.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include "internal.h" static DEFINE_RWLOCK(proc_subdir_lock); struct kmem_cache *proc_dir_entry_cache __ro_after_init; void pde_free(struct proc_dir_entry *pde) { if (S_ISLNK(pde->mode)) kfree(pde->data); if (pde->name != pde->inline_name) kfree(pde->name); kmem_cache_free(proc_dir_entry_cache, pde); } static int proc_match(const char *name, struct proc_dir_entry *de, unsigned int len) { if (len < de->namelen) return -1; if (len > de->namelen) return 1; return memcmp(name, de->name, len); } static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) { return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry, subdir_node); } static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, const char *name, unsigned int len) { struct rb_node *node = dir->subdir.rb_node; while (node) { struct proc_dir_entry *de = rb_entry(node, struct proc_dir_entry, subdir_node); int result = proc_match(name, de, len); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return de; } return NULL; } static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *de) { struct rb_root *root = &dir->subdir; struct rb_node **new = &root->rb_node, *parent = NULL; /* Figure out where to put new node */ while (*new) { struct proc_dir_entry *this = rb_entry(*new, struct proc_dir_entry, subdir_node); int result = proc_match(de->name, this, de->namelen); parent = *new; if (result < 0) new = &(*new)->rb_left; else if (result > 0) new = &(*new)->rb_right; else return false; } /* Add new node and rebalance tree. 
*/ rb_link_node(&de->subdir_node, parent, new); rb_insert_color(&de->subdir_node, root); return true; } static int proc_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) return error; setattr_copy(&nop_mnt_idmap, inode, iattr); proc_set_user(de, inode->i_uid, inode->i_gid); de->mode = inode->i_mode; return 0; } static int proc_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_dir_entry *de = PDE(inode); if (de) { nlink_t nlink = READ_ONCE(de->nlink); if (nlink > 0) { set_nlink(inode, nlink); } } generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } static const struct inode_operations proc_file_inode_operations = { .setattr = proc_notify_change, }; /* * This function parses a name such as "tty/driver/serial", and * returns the struct proc_dir_entry for "/proc/tty/driver", and * returns "serial" in residual. */ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, const char **residual) { const char *cp = name, *next; struct proc_dir_entry *de; de = *ret ?: &proc_root; while ((next = strchr(cp, '/')) != NULL) { de = pde_subdir_find(de, cp, next - cp); if (!de) { WARN(1, "name '%s'\n", name); return -ENOENT; } cp = next + 1; } *residual = cp; *ret = de; return 0; } static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, const char **residual) { int rv; read_lock(&proc_subdir_lock); rv = __xlate_proc_name(name, ret, residual); read_unlock(&proc_subdir_lock); return rv; } static DEFINE_IDA(proc_inum_ida); #define PROC_DYNAMIC_FIRST 0xF0000000U /* * Return an inode number between PROC_DYNAMIC_FIRST and * 0xffffffff, or zero on failure. */ int proc_alloc_inum(unsigned int *inum) { int i; i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST, GFP_KERNEL); if (i < 0) return i; *inum = PROC_DYNAMIC_FIRST + (unsigned int)i; return 0; } void proc_free_inum(unsigned int inum) { ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } static int proc_misc_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; if (atomic_read(&PDE(d_inode(dentry))->in_use) < 0) return 0; /* revalidate */ return 1; } static int proc_misc_d_delete(const struct dentry *dentry) { return atomic_read(&PDE(d_inode(dentry))->in_use) < 0; } static const struct dentry_operations proc_misc_dentry_ops = { .d_revalidate = proc_misc_d_revalidate, .d_delete = proc_misc_d_delete, }; /* * Don't create negative dentries here, return -ENOENT by hand * instead. 
*/ struct dentry *proc_lookup_de(struct inode *dir, struct dentry *dentry, struct proc_dir_entry *de) { struct inode *inode; read_lock(&proc_subdir_lock); de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); if (de) { pde_get(de); read_unlock(&proc_subdir_lock); inode = proc_get_inode(dir->i_sb, de); if (!inode) return ERR_PTR(-ENOMEM); d_set_d_op(dentry, de->proc_dops); return d_splice_alias(inode, dentry); } read_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); } struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct proc_fs_info *fs_info = proc_sb_info(dir->i_sb); if (fs_info->pidonly == PROC_PIDONLY_ON) return ERR_PTR(-ENOENT); return proc_lookup_de(dir, dentry, PDE(dir)); } /* * This returns non-zero if at EOF, so that the /proc * root directory can use this and check if it should * continue with the <pid> entries.. * * Note that the VFS-layer doesn't care about the return * value of the readdir() call, as long as it's non-negative * for success.. */ int proc_readdir_de(struct file *file, struct dir_context *ctx, struct proc_dir_entry *de) { int i; if (!dir_emit_dots(file, ctx)) return 0; i = ctx->pos - 2; read_lock(&proc_subdir_lock); de = pde_subdir_first(de); for (;;) { if (!de) { read_unlock(&proc_subdir_lock); return 0; } if (!i) break; de = pde_subdir_next(de); i--; } do { struct proc_dir_entry *next; pde_get(de); read_unlock(&proc_subdir_lock); if (!dir_emit(ctx, de->name, de->namelen, de->low_ino, de->mode >> 12)) { pde_put(de); return 0; } ctx->pos++; read_lock(&proc_subdir_lock); next = pde_subdir_next(de); pde_put(de); de = next; } while (de); read_unlock(&proc_subdir_lock); return 1; } int proc_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); if (fs_info->pidonly == PROC_PIDONLY_ON) return 1; return proc_readdir_de(file, ctx, PDE(inode)); } /* * These are the generic /proc directory operations. They * use the in-memory "struct proc_dir_entry" tree to parse * the /proc directory. */ static const struct file_operations proc_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = proc_readdir, }; static int proc_net_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { return 0; } const struct dentry_operations proc_net_dentry_ops = { .d_revalidate = proc_net_d_revalidate, .d_delete = always_delete_dentry, }; /* * proc directories can do almost nothing.. 
*/ static const struct inode_operations proc_dir_inode_operations = { .lookup = proc_lookup, .getattr = proc_getattr, .setattr = proc_notify_change, }; /* returns the registered entry, or frees dp and returns NULL on failure */ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp) { if (proc_alloc_inum(&dp->low_ino)) goto out_free_entry; write_lock(&proc_subdir_lock); dp->parent = dir; if (pde_subdir_insert(dir, dp) == false) { WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); write_unlock(&proc_subdir_lock); goto out_free_inum; } dir->nlink++; write_unlock(&proc_subdir_lock); return dp; out_free_inum: proc_free_inum(dp->low_ino); out_free_entry: pde_free(dp); return NULL; } static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, const char *name, umode_t mode, nlink_t nlink) { struct proc_dir_entry *ent = NULL; const char *fn; struct qstr qstr; if (xlate_proc_name(name, parent, &fn) != 0) goto out; qstr.name = fn; qstr.len = strlen(fn); if (qstr.len == 0 || qstr.len >= 256) { WARN(1, "name len %u\n", qstr.len); return NULL; } if (qstr.len == 1 && fn[0] == '.') { WARN(1, "name '.'\n"); return NULL; } if (qstr.len == 2 && fn[0] == '.' && fn[1] == '.') { WARN(1, "name '..'\n"); return NULL; } if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; } if (is_empty_pde(*parent)) { WARN(1, "attempt to add to permanently empty directory"); return NULL; } ent = kmem_cache_zalloc(proc_dir_entry_cache, GFP_KERNEL); if (!ent) goto out; if (qstr.len + 1 <= SIZEOF_PDE_INLINE_NAME) { ent->name = ent->inline_name; } else { ent->name = kmalloc(qstr.len + 1, GFP_KERNEL); if (!ent->name) { pde_free(ent); return NULL; } } memcpy(ent->name, fn, qstr.len + 1); ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; ent->subdir = RB_ROOT; refcount_set(&ent->refcnt, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); proc_set_user(ent, (*parent)->uid, (*parent)->gid); ent->proc_dops = &proc_misc_dentry_ops; /* Revalidate everything under /proc/${pid}/net */ if ((*parent)->proc_dops == &proc_net_dentry_ops) pde_force_lookup(ent); out: return ent; } struct proc_dir_entry *proc_symlink(const char *name, struct proc_dir_entry *parent, const char *dest) { struct proc_dir_entry *ent; ent = __proc_create(&parent, name, (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); if (ent) { ent->size = strlen(dest); ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL); if (ent->data) { ent->proc_iops = &proc_link_inode_operations; ent = proc_register(parent, ent); } else { pde_free(ent); ent = NULL; } } return ent; } EXPORT_SYMBOL(proc_symlink); struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode, struct proc_dir_entry *parent, void *data, bool force_lookup) { struct proc_dir_entry *ent; if (mode == 0) mode = S_IRUGO | S_IXUGO; ent = __proc_create(&parent, name, S_IFDIR | mode, 2); if (ent) { ent->data = data; ent->proc_dir_ops = &proc_dir_operations; ent->proc_iops = &proc_dir_inode_operations; if (force_lookup) { pde_force_lookup(ent); } ent = proc_register(parent, ent); } return ent; } EXPORT_SYMBOL_GPL(_proc_mkdir); struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, struct proc_dir_entry *parent, void *data) { return _proc_mkdir(name, mode, parent, data, false); } EXPORT_SYMBOL_GPL(proc_mkdir_data); struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, struct proc_dir_entry *parent) { return 
proc_mkdir_data(name, mode, parent, NULL); } EXPORT_SYMBOL(proc_mkdir_mode); struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) { return proc_mkdir_data(name, 0, parent, NULL); } EXPORT_SYMBOL(proc_mkdir); struct proc_dir_entry *proc_create_mount_point(const char *name) { umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO; struct proc_dir_entry *ent, *parent = NULL; ent = __proc_create(&parent, name, mode, 2); if (ent) { ent->data = NULL; ent->proc_dir_ops = NULL; ent->proc_iops = NULL; ent = proc_register(parent, ent); } return ent; } EXPORT_SYMBOL(proc_create_mount_point); struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry **parent, void *data) { struct proc_dir_entry *p; if ((mode & S_IFMT) == 0) mode |= S_IFREG; if ((mode & S_IALLUGO) == 0) mode |= S_IRUGO; if (WARN_ON_ONCE(!S_ISREG(mode))) return NULL; p = __proc_create(parent, name, mode, 1); if (p) { p->proc_iops = &proc_file_inode_operations; p->data = data; } return p; } static inline void pde_set_flags(struct proc_dir_entry *pde) { if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) pde->flags |= PROC_ENTRY_PERMANENT; } struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; p->proc_ops = proc_ops; pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_data); struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops) { return proc_create_data(name, mode, parent, proc_ops, NULL); } EXPORT_SYMBOL(proc_create); static int proc_seq_open(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); if (de->state_size) return seq_open_private(file, de->seq_ops, de->state_size); return seq_open(file, de->seq_ops); } static int proc_seq_release(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); if (de->state_size) return seq_release_private(inode, file); return seq_release(inode, file); } static const struct proc_ops proc_seq_ops = { /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = proc_seq_release, }; struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_seq_private); static int proc_single_open(struct inode *inode, struct file *file) { struct proc_dir_entry *de = PDE(inode); return single_open(file, de->single_show, de->data); } static const struct proc_ops proc_single_ops = { /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), void *data) { struct proc_dir_entry *p; p = proc_create_reg(name, mode, &parent, data); if (!p) return NULL; p->proc_ops = &proc_single_ops; p->single_show = show; return 
proc_register(parent, p); } EXPORT_SYMBOL(proc_create_single_data); void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; } EXPORT_SYMBOL(proc_set_size); void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) { de->uid = uid; de->gid = gid; } EXPORT_SYMBOL(proc_set_user); void pde_put(struct proc_dir_entry *pde) { if (refcount_dec_and_test(&pde->refcnt)) { proc_free_inum(pde->low_ino); pde_free(pde); } } /* * Remove a /proc entry and free it if it's not currently in use. */ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) { struct proc_dir_entry *de = NULL; const char *fn = name; unsigned int len; write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { write_unlock(&proc_subdir_lock); return; } len = strlen(fn); de = pde_subdir_find(parent, fn, len); if (de) { if (unlikely(pde_is_permanent(de))) { WARN(1, "removing permanent /proc entry '%s'", de->name); de = NULL; } else { rb_erase(&de->subdir_node, &parent->subdir); if (S_ISDIR(de->mode)) parent->nlink--; } } write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); return; } proc_entry_rundown(de); WARN(pde_subdir_first(de), "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", __func__, de->parent->name, de->name, pde_subdir_first(de)->name); pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { struct proc_dir_entry *root = NULL, *de, *next; const char *fn = name; unsigned int len; write_lock(&proc_subdir_lock); if (__xlate_proc_name(name, &parent, &fn) != 0) { write_unlock(&proc_subdir_lock); return -ENOENT; } len = strlen(fn); root = pde_subdir_find(parent, fn, len); if (!root) { write_unlock(&proc_subdir_lock); return -ENOENT; } if (unlikely(pde_is_permanent(root))) { write_unlock(&proc_subdir_lock); WARN(1, "removing permanent /proc entry '%s/%s'", root->parent->name, root->name); return -EINVAL; } rb_erase(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { if (unlikely(pde_is_permanent(next))) { write_unlock(&proc_subdir_lock); WARN(1, "removing permanent /proc entry '%s/%s'", next->parent->name, next->name); return -EINVAL; } rb_erase(&next->subdir_node, &de->subdir); de = next; continue; } next = de->parent; if (S_ISDIR(de->mode)) next->nlink--; write_unlock(&proc_subdir_lock); proc_entry_rundown(de); if (de == root) break; pde_put(de); write_lock(&proc_subdir_lock); de = next; } pde_put(root); return 0; } EXPORT_SYMBOL(remove_proc_subtree); void *proc_get_parent_data(const struct inode *inode) { struct proc_dir_entry *de = PDE(inode); return de->parent->data; } EXPORT_SYMBOL_GPL(proc_get_parent_data); void proc_remove(struct proc_dir_entry *de) { if (de) remove_proc_subtree(de->name, de->parent); } EXPORT_SYMBOL(proc_remove); /* * Pull a user buffer into memory and pass it to the file's write handler if * one is supplied. The ->write() method is permitted to modify the * kernel-side buffer. */ ssize_t proc_simple_write(struct file *f, const char __user *ubuf, size_t size, loff_t *_pos) { struct proc_dir_entry *pde = PDE(file_inode(f)); char *buf; int ret; if (!pde->write) return -EACCES; if (size == 0 || size > PAGE_SIZE - 1) return -EINVAL; buf = memdup_user_nul(ubuf, size); if (IS_ERR(buf)) return PTR_ERR(buf); ret = pde->write(f, buf, size); kfree(buf); return ret == 0 ? size : ret; } |
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" static inline bool devlink_rate_is_leaf(struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; } static inline bool devlink_rate_is_node(struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; } static struct devlink_rate * devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) { struct devlink_rate *devlink_rate; struct devlink_port *devlink_port; devlink_port = devlink_port_get_from_attrs(devlink, info->attrs); if (IS_ERR(devlink_port)) return ERR_CAST(devlink_port); devlink_rate = devlink_port->devlink_rate; return devlink_rate ?: ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { static struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && !strcmp(node_name, devlink_rate->name)) return devlink_rate; } return ERR_PTR(-ENODEV); } static struct devlink_rate * devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs) { const char *rate_node_name; size_t len; if (!attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return ERR_PTR(-EINVAL); rate_node_name = nla_data(attrs[DEVLINK_ATTR_RATE_NODE_NAME]); len = strlen(rate_node_name); /* Name cannot be empty or decimal number */ if (!len || strspn(rate_node_name, "0123456789") == len) return ERR_PTR(-EINVAL); return devlink_rate_node_get_by_name(devlink, rate_node_name); } static struct devlink_rate * devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info) { return devlink_rate_node_get_from_attrs(devlink, info->attrs); } static struct devlink_rate * devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) { struct nlattr **attrs = info->attrs; if (attrs[DEVLINK_ATTR_PORT_INDEX]) return devlink_rate_leaf_get_from_info(devlink, info); else if (attrs[DEVLINK_ATTR_RATE_NODE_NAME]) return devlink_rate_node_get_from_info(devlink, info); else return ERR_PTR(-EINVAL); } static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_rate->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto nla_put_failure; if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type)) goto nla_put_failure; if (devlink_rate_is_leaf(devlink_rate)) { if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_rate->devlink_port->index)) goto nla_put_failure; } else if (devlink_rate_is_node(devlink_rate)) { if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME, devlink_rate->name)) goto nla_put_failure; } if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_SHARE, devlink_rate->tx_share)) goto nla_put_failure; if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_MAX, devlink_rate->tx_max)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, devlink_rate->tx_priority)) goto nla_put_failure; if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, devlink_rate->tx_weight)) goto nla_put_failure; if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static 
void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { struct devlink *devlink = devlink_rate->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } devlink_nl_notify_send(devlink, msg); } void devlink_rates_notify_register(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); } void devlink_rates_notify_unregister(struct devlink *devlink) { struct devlink_rate *rate_node; list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); } static int devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_rate *devlink_rate; int idx = 0; int err = 0; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; u32 id = NETLINK_CB(cb->skb).portid; if (idx < state->idx) { idx++; continue; } err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, cb->nlh->nlmsg_seq, flags, NULL); if (err) { state->idx = idx; break; } idx++; } return err; } int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one); } int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; struct sk_buff *msg; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static bool devlink_rate_is_parent_node(struct devlink_rate *devlink_rate, struct devlink_rate *parent) { while (parent) { if (parent == devlink_rate) return true; parent = parent->parent; } return false; } static int devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, struct genl_info *info, struct nlattr *nla_parent) { struct devlink *devlink = devlink_rate->devlink; const char *parent_name = nla_data(nla_parent); const struct devlink_ops *ops = devlink->ops; size_t len = strlen(parent_name); struct devlink_rate *parent; int err = -EOPNOTSUPP; parent = devlink_rate->parent; if (parent && !len) { if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, info->extack); if (err) return err; refcount_dec(&parent->refcnt); devlink_rate->parent = NULL; } else if (len) { parent = devlink_rate_node_get_by_name(devlink, parent_name); if (IS_ERR(parent)) return -ENODEV; if (parent == devlink_rate) { NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed"); return -EINVAL; } if (devlink_rate_is_node(devlink_rate) && devlink_rate_is_parent_node(devlink_rate, 
parent->parent)) { NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node."); return -EEXIST; } if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_parent_set(devlink_rate, parent, devlink_rate->priv, parent->priv, info->extack); if (err) return err; if (devlink_rate->parent) /* we're reassigning to other parent in this case */ refcount_dec(&devlink_rate->parent->refcnt); refcount_inc(&parent->refcnt); devlink_rate->parent = parent; } return 0; } static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) { struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u32 priority; u32 weight; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_share = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) { rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv, rate, info->extack); if (err) return err; devlink_rate->tx_max = rate; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, priority, info->extack); if (err) return err; devlink_rate->tx_priority = priority; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); else if (devlink_rate_is_node(devlink_rate)) err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, weight, info->extack); if (err) return err; devlink_rate->tx_weight = weight; } nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, nla_parent); if (err) return err; } return 0; } static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, struct genl_info *info, enum devlink_rate_type type) { struct nlattr **attrs = info->attrs; if (type == DEVLINK_RATE_TYPE_LEAF) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_leaf_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, 
attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the leafs"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the leafs"); return false; } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) { NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] && !ops->rate_node_parent_set) { NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], "TX priority set isn't supported for the nodes"); return false; } if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { NL_SET_ERR_MSG_ATTR(info->extack, attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], "TX weight set isn't supported for the nodes"); return false; } } else { WARN(1, "Unknown type of rate object"); return false; } return true; } int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *devlink_rate; const struct devlink_ops *ops; int err; devlink_rate = devlink_rate_get_from_info(devlink, info); if (IS_ERR(devlink_rate)) return PTR_ERR(devlink_rate); ops = devlink->ops; if (!ops || !devlink_rate_set_ops_supported(ops, info, devlink_rate->type)) return -EOPNOTSUPP; err = devlink_nl_rate_set(devlink_rate, ops, info); if (!err) devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return err; } int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; const struct devlink_ops *ops; int err; ops = devlink->ops; if (!ops || !ops->rate_node_new || !ops->rate_node_del) { NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported"); return -EOPNOTSUPP; } if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE)) return -EOPNOTSUPP; rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs); if (!IS_ERR(rate_node)) return -EEXIST; else if (rate_node == ERR_PTR(-EINVAL)) return -EINVAL; rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return -ENOMEM; rate_node->devlink = devlink; rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL); if (!rate_node->name) { err = -ENOMEM; goto err_strdup; } err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack); if (err) goto err_node_new; err = devlink_nl_rate_set(rate_node, ops, info); if (err) goto err_rate_set; refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return 0; err_rate_set: ops->rate_node_del(rate_node, rate_node->priv, info->extack); err_node_new: kfree(rate_node->name); err_strdup: kfree(rate_node); return err; } int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_rate *rate_node; int err; rate_node = devlink_rate_node_get_from_info(devlink, info); if (IS_ERR(rate_node)) 
return PTR_ERR(rate_node); if (refcount_read(&rate_node->refcnt) > 1) { NL_SET_ERR_MSG(info->extack, "Node has children. Cannot delete node."); return -EBUSY; } devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); err = devlink->ops->rate_node_del(rate_node, rate_node->priv, info->extack); if (rate_node->parent) refcount_dec(&rate_node->parent->refcnt); list_del(&rate_node->list); kfree(rate_node->name); kfree(rate_node); return err; } int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) if (devlink_rate_is_node(devlink_rate)) { NL_SET_ERR_MSG(extack, "Rate node(s) exists."); return -EBUSY; } return 0; } /** * devl_rate_node_create - create devlink rate node * @devlink: devlink instance * @priv: driver private data * @node_name: name of the resulting node * @parent: parent devlink_rate struct * * Create devlink rate object of type node */ struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent) { struct devlink_rate *rate_node; rate_node = devlink_rate_node_get_by_name(devlink, node_name); if (!IS_ERR(rate_node)) return ERR_PTR(-EEXIST); rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); if (!rate_node) return ERR_PTR(-ENOMEM); if (parent) { rate_node->parent = parent; refcount_inc(&rate_node->parent->refcnt); } rate_node->type = DEVLINK_RATE_TYPE_NODE; rate_node->devlink = devlink; rate_node->priv = priv; rate_node->name = kstrdup(node_name, GFP_KERNEL); if (!rate_node->name) { kfree(rate_node); return ERR_PTR(-ENOMEM); } refcount_set(&rate_node->refcnt, 1); list_add(&rate_node->list, &devlink->rate_list); devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); return rate_node; } EXPORT_SYMBOL_GPL(devl_rate_node_create); /** * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data * @parent: parent devlink_rate struct * * Create devlink rate object of type leaf on provided @devlink_port. */ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, struct devlink_rate *parent) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; devl_assert_locked(devlink_port->devlink); if (WARN_ON(devlink_port->devlink_rate)) return -EBUSY; devlink_rate = kzalloc(sizeof(*devlink_rate), GFP_KERNEL); if (!devlink_rate) return -ENOMEM; if (parent) { devlink_rate->parent = parent; refcount_inc(&devlink_rate->parent->refcnt); } devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; devlink_rate->priv = priv; list_add_tail(&devlink_rate->list, &devlink->rate_list); devlink_port->devlink_rate = devlink_rate; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_NEW); return 0; } EXPORT_SYMBOL_GPL(devl_rate_leaf_create); /** * devl_rate_leaf_destroy - destroy devlink rate leaf * * @devlink_port: devlink port linked to the rate object * * Destroy the devlink rate object of type leaf on provided @devlink_port. 
*/ void devl_rate_leaf_destroy(struct devlink_port *devlink_port) { struct devlink_rate *devlink_rate = devlink_port->devlink_rate; devl_assert_locked(devlink_port->devlink); if (!devlink_rate) return; devlink_rate_notify(devlink_rate, DEVLINK_CMD_RATE_DEL); if (devlink_rate->parent) refcount_dec(&devlink_rate->parent->refcnt); list_del(&devlink_rate->list); devlink_port->devlink_rate = NULL; kfree(devlink_rate); } EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); /** * devl_rate_nodes_destroy - destroy all devlink rate nodes on device * @devlink: devlink instance * * Unset parent for all rate objects and destroy all rate nodes * on specified device. */ void devl_rate_nodes_destroy(struct devlink *devlink) { struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; devl_assert_locked(devlink); list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (!devlink_rate->parent) continue; refcount_dec(&devlink_rate->parent->refcnt); if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if (devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { ops->rate_node_del(devlink_rate, devlink_rate->priv, NULL); list_del(&devlink_rate->list); kfree(devlink_rate->name); kfree(devlink_rate); } } } EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
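/*
 * Editor's illustration (not part of the original file): a minimal sketch of
 * how a driver might consume the exported rate API above, assuming it already
 * owns a devlink instance and a registered devlink_port. The helpers used,
 * devl_rate_node_create(), devl_rate_leaf_create(), devl_rate_leaf_destroy()
 * and devl_rate_nodes_destroy(), are the ones defined above and must be
 * called with the instance lock held (devl_lock()/devl_unlock()), as the
 * devl_assert_locked() checks require. "foo_rate_priv" and the function
 * names are hypothetical.
 */
#include <linux/err.h>
#include <net/devlink.h>

struct foo_rate_priv {
	u32 hw_group_id;	/* hypothetical per-object driver state */
};

static int foo_register_rate_objects(struct devlink *devlink,
				     struct devlink_port *port,
				     struct foo_rate_priv *priv)
{
	char name[] = "group0";
	struct devlink_rate *node;
	int err;

	devl_lock(devlink);

	/* A node created with a NULL parent becomes a root scheduling group. */
	node = devl_rate_node_create(devlink, priv, name, NULL);
	if (IS_ERR(node)) {
		err = PTR_ERR(node);
		goto out;
	}

	/* Hang the port's leaf rate object under that node.
	 * (Error unwinding of the node is left out for brevity.)
	 */
	err = devl_rate_leaf_create(port, priv, node);
out:
	devl_unlock(devlink);
	return err;
}

static void foo_unregister_rate_objects(struct devlink *devlink,
					struct devlink_port *port)
{
	devl_lock(devlink);
	devl_rate_leaf_destroy(port);		/* leaves first ... */
	devl_rate_nodes_destroy(devlink);	/* ... then remaining nodes */
	devl_unlock(devlink);
}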
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H #include <linux/device.h> #include <linux/err.h> #include <linux/dma-direction.h> #include <linux/scatterlist.h> #include <linux/bug.h> /** * List of possible attributes associated with a DMA mapping. The semantics * of each attribute should be defined in Documentation/core-api/dma-attributes.rst. */ /* * DMA_ATTR_WEAK_ORDERING: Specifies that reads and writes to the mapping * may be weakly ordered, that is that reads and writes may pass each other. */ #define DMA_ATTR_WEAK_ORDERING (1UL << 1) /* * DMA_ATTR_WRITE_COMBINE: Specifies that writes to the mapping may be * buffered to improve performance. */ #define DMA_ATTR_WRITE_COMBINE (1UL << 2) /* * DMA_ATTR_NO_KERNEL_MAPPING: Lets the platform avoid creating a kernel * virtual mapping for the allocated buffer. */ #define DMA_ATTR_NO_KERNEL_MAPPING (1UL << 4) /* * DMA_ATTR_SKIP_CPU_SYNC: Allows platform code to skip synchronization of * the CPU cache for the given buffer assuming that it has been already * transferred to 'device' domain.
*/ #define DMA_ATTR_SKIP_CPU_SYNC (1UL << 5) /* * DMA_ATTR_FORCE_CONTIGUOUS: Forces contiguous allocation of the buffer * in physical memory. */ #define DMA_ATTR_FORCE_CONTIGUOUS (1UL << 6) /* * DMA_ATTR_ALLOC_SINGLE_PAGES: This is a hint to the DMA-mapping subsystem * that it's probably not worth the time to try to allocate memory to in a way * that gives better TLB efficiency. */ #define DMA_ATTR_ALLOC_SINGLE_PAGES (1UL << 7) /* * DMA_ATTR_NO_WARN: This tells the DMA-mapping subsystem to suppress * allocation failure reports (similarly to __GFP_NOWARN). */ #define DMA_ATTR_NO_WARN (1UL << 8) /* * DMA_ATTR_PRIVILEGED: used to indicate that the buffer is fully * accessible at an elevated privilege level (and ideally inaccessible or * at least read-only at lesser-privileged levels). */ #define DMA_ATTR_PRIVILEGED (1UL << 9) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. It is specific to a * given device and there may be a translation between the CPU physical address * space and the bus address space. * * DMA_MAPPING_ERROR is the magic error code if a mapping failed. It should not * be used directly in drivers, but checked for using dma_mapping_error() * instead. */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) #ifdef CONFIG_DMA_API_DEBUG void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len); #else static inline void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { } static inline void debug_dma_map_single(struct device *dev, const void *addr, unsigned long len) { } #endif /* CONFIG_DMA_API_DEBUG */ #ifdef CONFIG_HAS_DMA static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { debug_dma_mapping_error(dev, dma_addr); if (unlikely(dma_addr == DMA_MAPPING_ERROR)) return -ENOMEM; return 0; } dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs); void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs); void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); int dma_mmap_attrs(struct device *dev, struct 
vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs); bool dma_can_mmap(struct device *dev); bool dma_pci_p2pdma_supported(struct device *dev); int dma_set_mask(struct device *dev, u64 mask); int dma_set_coherent_mask(struct device *dev, u64 mask); u64 dma_get_required_mask(struct device *dev); bool dma_addressing_limited(struct device *dev); size_t dma_max_mapping_size(struct device *dev); size_t dma_opt_mapping_size(struct device *dev); unsigned long dma_get_merge_boundary(struct device *dev); struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs); void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir); void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt); void dma_vunmap_noncontiguous(struct device *dev, void *vaddr); int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt); #else /* CONFIG_HAS_DMA */ static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } static inline unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { return 0; } static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { } static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { return -EOPNOTSUPP; } static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { return DMA_MAPPING_ERROR; } static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { } static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { return -ENOMEM; } static inline void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { return NULL; } static void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { } static inline void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { return NULL; } static inline void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { } static inline int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { return -ENXIO; } static inline int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { return -ENXIO; } static inline bool dma_can_mmap(struct device *dev) { return false; } static inline bool dma_pci_p2pdma_supported(struct device *dev) { return false; } static inline int dma_set_mask(struct device *dev, u64 mask) { return -EIO; } static inline int dma_set_coherent_mask(struct device *dev, u64 mask) { return -EIO; } static inline u64 dma_get_required_mask(struct device *dev) { return 0; } static 
inline bool dma_addressing_limited(struct device *dev) { return false; } static inline size_t dma_max_mapping_size(struct device *dev) { return 0; } static inline size_t dma_opt_mapping_size(struct device *dev) { return 0; } static inline unsigned long dma_get_merge_boundary(struct device *dev) { return 0; } static inline struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, enum dma_data_direction dir, gfp_t gfp, unsigned long attrs) { return NULL; } static inline void dma_free_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt, enum dma_data_direction dir) { } static inline void *dma_vmap_noncontiguous(struct device *dev, size_t size, struct sg_table *sgt) { return NULL; } static inline void dma_vunmap_noncontiguous(struct device *dev, void *vaddr) { } static inline int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt) { return -EINVAL; } #endif /* CONFIG_HAS_DMA */ #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) void __dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); void __dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir); void __dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); void __dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir); bool __dma_need_sync(struct device *dev, dma_addr_t dma_addr); static inline bool dma_dev_need_sync(const struct device *dev) { /* Always call DMA sync operations when debugging is enabled */ return !dev->dma_skip_sync || IS_ENABLED(CONFIG_DMA_API_DEBUG); } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { if (dma_dev_need_sync(dev)) __dma_sync_single_for_cpu(dev, addr, size, dir); } static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { if (dma_dev_need_sync(dev)) __dma_sync_single_for_device(dev, addr, size, dir); } static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { if (dma_dev_need_sync(dev)) __dma_sync_sg_for_cpu(dev, sg, nelems, dir); } static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { if (dma_dev_need_sync(dev)) __dma_sync_sg_for_device(dev, sg, nelems, dir); } static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return dma_dev_need_sync(dev) ? 
__dma_need_sync(dev, dma_addr) : false; } #else /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ static inline bool dma_dev_need_sync(const struct device *dev) { return false; } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { } static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { } static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { } static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { } static inline bool dma_need_sync(struct device *dev, dma_addr_t dma_addr) { return false; } #endif /* !CONFIG_HAS_DMA || !CONFIG_DMA_NEED_SYNC */ struct page *dma_alloc_pages(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void dma_free_pages(struct device *dev, size_t size, struct page *page, dma_addr_t dma_handle, enum dma_data_direction dir); int dma_mmap_pages(struct device *dev, struct vm_area_struct *vma, size_t size, struct page *page); static inline void *dma_alloc_noncoherent(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp) { struct page *page = dma_alloc_pages(dev, size, dma_handle, dir, gfp); return page ? page_address(page) : NULL; } static inline void dma_free_noncoherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, enum dma_data_direction dir) { dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir); } static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { /* DMA must never operate on areas that might be remapped. */ if (dev_WARN_ONCE(dev, is_vmalloc_addr(ptr), "rejecting DMA map of vmalloc memory\n")) return DMA_MAPPING_ERROR; debug_dma_map_single(dev, ptr, size); return dma_map_page_attrs(dev, virt_to_page(ptr), offset_in_page(ptr), size, dir, attrs); } static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { return dma_unmap_page_attrs(dev, addr, size, dir, attrs); } static inline void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { return dma_sync_single_for_cpu(dev, addr + offset, size, dir); } static inline void dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { return dma_sync_single_for_device(dev, addr + offset, size, dir); } /** * dma_unmap_sgtable - Unmap the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * @attrs: Optional DMA attributes for the unmap operation * * Unmaps a buffer described by a scatterlist stored in the given sg_table * object for the @dir DMA operation by the @dev device. After this function * the ownership of the buffer is transferred back to the CPU domain. 
*/ static inline void dma_unmap_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { dma_unmap_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs); } /** * dma_sync_sgtable_for_cpu - Synchronize the given buffer for CPU access * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * * Performs the needed cache synchronization and moves the ownership of the * buffer back to the CPU domain, so it is safe to perform any access to it * by the CPU. Before doing any further DMA operations, one has to transfer * the ownership of the buffer back to the DMA domain by calling the * dma_sync_sgtable_for_device(). */ static inline void dma_sync_sgtable_for_cpu(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir) { dma_sync_sg_for_cpu(dev, sgt->sgl, sgt->orig_nents, dir); } /** * dma_sync_sgtable_for_device - Synchronize the given buffer for DMA * @dev: The device for which to perform the DMA operation * @sgt: The sg_table object describing the buffer * @dir: DMA direction * * Performs the needed cache synchronization and moves the ownership of the * buffer back to the DMA domain, so it is safe to perform the DMA operation. * Once finished, one has to call dma_sync_sgtable_for_cpu() or * dma_unmap_sgtable(). */ static inline void dma_sync_sgtable_for_device(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir) { dma_sync_sg_for_device(dev, sgt->sgl, sgt->orig_nents, dir); } #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0) #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0) bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size); static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { return dma_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? DMA_ATTR_NO_WARN : 0); } static inline void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) { return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0); } static inline u64 dma_get_mask(struct device *dev) { if (dev->dma_mask && *dev->dma_mask) return *dev->dma_mask; return DMA_BIT_MASK(32); } /* * Set both the DMA mask and the coherent DMA mask to the same thing. * Note that we don't check the return value from dma_set_coherent_mask() * as the DMA API guarantees that the coherent DMA mask can be set to * the same or smaller than the streaming DMA mask. */ static inline int dma_set_mask_and_coherent(struct device *dev, u64 mask) { int rc = dma_set_mask(dev, mask); if (rc == 0) dma_set_coherent_mask(dev, mask); return rc; } /* * Similar to the above, except it deals with the case where the device * does not have dev->dma_mask appropriately setup. 
*/ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) { dev->dma_mask = &dev->coherent_dma_mask; return dma_set_mask_and_coherent(dev, mask); } static inline unsigned int dma_get_max_seg_size(struct device *dev) { if (dev->dma_parms && dev->dma_parms->max_segment_size) return dev->dma_parms->max_segment_size; return SZ_64K; } static inline void dma_set_max_seg_size(struct device *dev, unsigned int size) { if (WARN_ON_ONCE(!dev->dma_parms)) return; dev->dma_parms->max_segment_size = size; } static inline unsigned long dma_get_seg_boundary(struct device *dev) { if (dev->dma_parms && dev->dma_parms->segment_boundary_mask) return dev->dma_parms->segment_boundary_mask; return ULONG_MAX; } /** * dma_get_seg_boundary_nr_pages - return the segment boundary in "page" units * @dev: device to guery the boundary for * @page_shift: ilog() of the IOMMU page size * * Return the segment boundary in IOMMU page units (which may be different from * the CPU page size) for the passed in device. * * If @dev is NULL a boundary of U32_MAX is assumed, this case is just for * non-DMA API callers. */ static inline unsigned long dma_get_seg_boundary_nr_pages(struct device *dev, unsigned int page_shift) { if (!dev) return (U32_MAX >> page_shift) + 1; return (dma_get_seg_boundary(dev) >> page_shift) + 1; } static inline void dma_set_seg_boundary(struct device *dev, unsigned long mask) { if (WARN_ON_ONCE(!dev->dma_parms)) return; dev->dma_parms->segment_boundary_mask = mask; } static inline unsigned int dma_get_min_align_mask(struct device *dev) { if (dev->dma_parms) return dev->dma_parms->min_align_mask; return 0; } static inline void dma_set_min_align_mask(struct device *dev, unsigned int min_align_mask) { if (WARN_ON_ONCE(!dev->dma_parms)) return; dev->dma_parms->min_align_mask = min_align_mask; } #ifndef dma_get_cache_alignment static inline int dma_get_cache_alignment(void) { #ifdef ARCH_HAS_DMA_MINALIGN return ARCH_DMA_MINALIGN; #endif return 1; } #endif static inline void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { return dmam_alloc_attrs(dev, size, dma_handle, gfp, (gfp & __GFP_NOWARN) ? 
DMA_ATTR_NO_WARN : 0); } static inline void *dma_alloc_wc(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { unsigned long attrs = DMA_ATTR_WRITE_COMBINE; if (gfp & __GFP_NOWARN) attrs |= DMA_ATTR_NO_WARN; return dma_alloc_attrs(dev, size, dma_addr, gfp, attrs); } static inline void dma_free_wc(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr) { return dma_free_attrs(dev, size, cpu_addr, dma_addr, DMA_ATTR_WRITE_COMBINE); } static inline int dma_mmap_wc(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size) { return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, DMA_ATTR_WRITE_COMBINE); } #ifdef CONFIG_NEED_DMA_MAP_STATE #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) __u32 LEN_NAME #define dma_unmap_addr(PTR, ADDR_NAME) ((PTR)->ADDR_NAME) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) (((PTR)->ADDR_NAME) = (VAL)) #define dma_unmap_len(PTR, LEN_NAME) ((PTR)->LEN_NAME) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) (((PTR)->LEN_NAME) = (VAL)) #else #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) #define dma_unmap_addr(PTR, ADDR_NAME) (0) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) #define dma_unmap_len(PTR, LEN_NAME) (0) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif #endif /* _LINUX_DMA_MAPPING_H */ |
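/*
 * Editor's illustration (not part of the original header): a minimal sketch
 * of the streaming DMA flow declared above, assuming a device that can only
 * address 32 bits and a kmalloc()-style buffer (the header above explicitly
 * rejects vmalloc memory in dma_map_single_attrs()). The device, buffer and
 * function name are hypothetical; the dma_* calls are the ones from this
 * header.
 */
#include <linux/dma-mapping.h>

static int foo_dma_roundtrip(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle;
	int err;

	/* Declare the device's addressing capability before mapping. */
	err = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
	if (err)
		return err;

	/* Hand the buffer to the device; ownership moves to the DMA domain. */
	handle = dma_map_single(dev, buf, len, DMA_BIDIRECTIONAL);
	err = dma_mapping_error(dev, handle);
	if (err)
		return err;

	/* ... program 'handle' into the (hypothetical) hardware here ... */

	/* Reclaim ownership before the CPU reads the data back. */
	dma_sync_single_for_cpu(dev, handle, len, DMA_BIDIRECTIONAL);
	/* ... the CPU may inspect or modify 'buf' here ... */
	dma_sync_single_for_device(dev, handle, len, DMA_BIDIRECTIONAL);

	dma_unmap_single(dev, handle, len, DMA_BIDIRECTIONAL);
	return 0;
}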
// SPDX-License-Identifier: GPL-2.0-or-later /* * Net1080 based USB host-to-host cables * Copyright (C) 2000-2005 by David Brownell */ // #define DEBUG // error path messages, extra info // #define VERBOSE // more; success messages #include <linux/module.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> #include <linux/workqueue.h> #include <linux/mii.h> #include <linux/usb.h> #include <linux/usb/usbnet.h> #include <linux/slab.h> #include <linux/unaligned.h> /* * Netchip 1080 driver ... http://www.netchip.com * (Sept 2004: End-of-life announcement has been sent.) * Used in (some) LapLink cables */ #define frame_errors data[1] /* * NetChip framing of ethernet packets, supporting additional error * checks for links that may drop bulk packets from inside messages. * Odd USB length == always short read for last usb packet. * - nc_header * - Ethernet header (14 bytes) * - payload * - (optional padding byte, if needed so length becomes odd) * - nc_trailer * * This framing is to be avoided for non-NetChip devices.
*/ struct nc_header { // packed: __le16 hdr_len; // sizeof nc_header (LE, all) __le16 packet_len; // payload size (including ethhdr) __le16 packet_id; // detects dropped packets #define MIN_HEADER 6 // all else is optional, and must start with: // __le16 vendorId; // from usb-if // __le16 productId; } __packed; #define PAD_BYTE ((unsigned char)0xAC) struct nc_trailer { __le16 packet_id; } __packed; // packets may use FLAG_FRAMING_NC and optional pad #define FRAMED_SIZE(mtu) (sizeof (struct nc_header) \ + sizeof (struct ethhdr) \ + (mtu) \ + 1 \ + sizeof (struct nc_trailer)) #define MIN_FRAMED FRAMED_SIZE(0) /* packets _could_ be up to 64KB... */ #define NC_MAX_PACKET 32767 /* * Zero means no timeout; else, how long a 64 byte bulk packet may be queued * before the hardware drops it. If that's done, the driver will need to * frame network packets to guard against the dropped USB packets. The win32 * driver sets this for both sides of the link. */ #define NC_READ_TTL_MS ((u8)255) // ms /* * We ignore most registers and EEPROM contents. */ #define REG_USBCTL ((u8)0x04) #define REG_TTL ((u8)0x10) #define REG_STATUS ((u8)0x11) /* * Vendor specific requests to read/write data */ #define REQUEST_REGISTER ((u8)0x10) #define REQUEST_EEPROM ((u8)0x11) static int nc_vendor_read(struct usbnet *dev, u8 req, u8 regnum, u16 *retval_ptr) { int status = usbnet_read_cmd(dev, req, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, regnum, retval_ptr, sizeof *retval_ptr); if (status > 0) status = 0; if (!status) le16_to_cpus(retval_ptr); return status; } static inline int nc_register_read(struct usbnet *dev, u8 regnum, u16 *retval_ptr) { return nc_vendor_read(dev, REQUEST_REGISTER, regnum, retval_ptr); } static void nc_vendor_write(struct usbnet *dev, u8 req, u8 regnum, u16 value) { usbnet_write_cmd(dev, req, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, regnum, NULL, 0); } static inline void nc_register_write(struct usbnet *dev, u8 regnum, u16 value) { nc_vendor_write(dev, REQUEST_REGISTER, regnum, value); } #if 0 static void nc_dump_registers(struct usbnet *dev) { u8 reg; u16 *vp = kmalloc(sizeof (u16)); if (!vp) return; netdev_dbg(dev->net, "registers:\n"); for (reg = 0; reg < 0x20; reg++) { int retval; // reading some registers is trouble if (reg >= 0x08 && reg <= 0xf) continue; if (reg >= 0x12 && reg <= 0x1e) continue; retval = nc_register_read(dev, reg, vp); if (retval < 0) netdev_dbg(dev->net, "reg [0x%x] ==> error %d\n", reg, retval); else netdev_dbg(dev->net, "reg [0x%x] = 0x%x\n", reg, *vp); } kfree(vp); } #endif /*-------------------------------------------------------------------------*/ /* * Control register */ #define USBCTL_WRITABLE_MASK 0x1f0f // bits 15-13 reserved, r/o #define USBCTL_ENABLE_LANG (1 << 12) #define USBCTL_ENABLE_MFGR (1 << 11) #define USBCTL_ENABLE_PROD (1 << 10) #define USBCTL_ENABLE_SERIAL (1 << 9) #define USBCTL_ENABLE_DEFAULTS (1 << 8) // bits 7-4 reserved, r/o #define USBCTL_FLUSH_OTHER (1 << 3) #define USBCTL_FLUSH_THIS (1 << 2) #define USBCTL_DISCONN_OTHER (1 << 1) #define USBCTL_DISCONN_THIS (1 << 0) static inline void nc_dump_usbctl(struct usbnet *dev, u16 usbctl) { netif_dbg(dev, link, dev->net, "net1080 %s-%s usbctl 0x%x:%s%s%s%s%s; this%s%s; other%s%s; r/o 0x%x\n", dev->udev->bus->bus_name, dev->udev->devpath, usbctl, (usbctl & USBCTL_ENABLE_LANG) ? " lang" : "", (usbctl & USBCTL_ENABLE_MFGR) ? " mfgr" : "", (usbctl & USBCTL_ENABLE_PROD) ? " prod" : "", (usbctl & USBCTL_ENABLE_SERIAL) ? " serial" : "", (usbctl & USBCTL_ENABLE_DEFAULTS) ? 
" defaults" : "", (usbctl & USBCTL_FLUSH_THIS) ? " FLUSH" : "", (usbctl & USBCTL_DISCONN_THIS) ? " DIS" : "", (usbctl & USBCTL_FLUSH_OTHER) ? " FLUSH" : "", (usbctl & USBCTL_DISCONN_OTHER) ? " DIS" : "", usbctl & ~USBCTL_WRITABLE_MASK); } /*-------------------------------------------------------------------------*/ /* * Status register */ #define STATUS_PORT_A (1 << 15) #define STATUS_CONN_OTHER (1 << 14) #define STATUS_SUSPEND_OTHER (1 << 13) #define STATUS_MAILBOX_OTHER (1 << 12) #define STATUS_PACKETS_OTHER(n) (((n) >> 8) & 0x03) #define STATUS_CONN_THIS (1 << 6) #define STATUS_SUSPEND_THIS (1 << 5) #define STATUS_MAILBOX_THIS (1 << 4) #define STATUS_PACKETS_THIS(n) (((n) >> 0) & 0x03) #define STATUS_UNSPEC_MASK 0x0c8c #define STATUS_NOISE_MASK ((u16)~(0x0303|STATUS_UNSPEC_MASK)) static inline void nc_dump_status(struct usbnet *dev, u16 status) { netif_dbg(dev, link, dev->net, "net1080 %s-%s status 0x%x: this (%c) PKT=%d%s%s%s; other PKT=%d%s%s%s; unspec 0x%x\n", dev->udev->bus->bus_name, dev->udev->devpath, status, // XXX the packet counts don't seem right // (1 at reset, not 0); maybe UNSPEC too (status & STATUS_PORT_A) ? 'A' : 'B', STATUS_PACKETS_THIS(status), (status & STATUS_CONN_THIS) ? " CON" : "", (status & STATUS_SUSPEND_THIS) ? " SUS" : "", (status & STATUS_MAILBOX_THIS) ? " MBOX" : "", STATUS_PACKETS_OTHER(status), (status & STATUS_CONN_OTHER) ? " CON" : "", (status & STATUS_SUSPEND_OTHER) ? " SUS" : "", (status & STATUS_MAILBOX_OTHER) ? " MBOX" : "", status & STATUS_UNSPEC_MASK); } /*-------------------------------------------------------------------------*/ /* * TTL register */ #define TTL_OTHER(ttl) (0x00ff & (ttl >> 8)) #define MK_TTL(this,other) ((u16)(((other)<<8)|(0x00ff&(this)))) /*-------------------------------------------------------------------------*/ static int net1080_reset(struct usbnet *dev) { u16 usbctl, status, ttl; u16 vp; int retval; // nc_dump_registers(dev); if ((retval = nc_register_read(dev, REG_STATUS, &vp)) < 0) { netdev_dbg(dev->net, "can't read %s-%s status: %d\n", dev->udev->bus->bus_name, dev->udev->devpath, retval); goto done; } status = vp; nc_dump_status(dev, status); if ((retval = nc_register_read(dev, REG_USBCTL, &vp)) < 0) { netdev_dbg(dev->net, "can't read USBCTL, %d\n", retval); goto done; } usbctl = vp; nc_dump_usbctl(dev, usbctl); nc_register_write(dev, REG_USBCTL, USBCTL_FLUSH_THIS | USBCTL_FLUSH_OTHER); if ((retval = nc_register_read(dev, REG_TTL, &vp)) < 0) { netdev_dbg(dev->net, "can't read TTL, %d\n", retval); goto done; } ttl = vp; nc_register_write(dev, REG_TTL, MK_TTL(NC_READ_TTL_MS, TTL_OTHER(ttl)) ); netdev_dbg(dev->net, "assigned TTL, %d ms\n", NC_READ_TTL_MS); netif_info(dev, link, dev->net, "port %c, peer %sconnected\n", (status & STATUS_PORT_A) ? 'A' : 'B', (status & STATUS_CONN_OTHER) ? 
"" : "dis"); retval = 0; done: return retval; } static int net1080_check_connect(struct usbnet *dev) { int retval; u16 status; u16 vp; retval = nc_register_read(dev, REG_STATUS, &vp); status = vp; if (retval != 0) { netdev_dbg(dev->net, "net1080_check_conn read - %d\n", retval); return retval; } if ((status & STATUS_CONN_OTHER) != STATUS_CONN_OTHER) return -ENOLINK; return 0; } static void nc_ensure_sync(struct usbnet *dev) { if (++dev->frame_errors <= 5) return; if (usbnet_write_cmd_async(dev, REQUEST_REGISTER, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, USBCTL_FLUSH_THIS | USBCTL_FLUSH_OTHER, REG_USBCTL, NULL, 0)) return; netif_dbg(dev, rx_err, dev->net, "flush net1080; too many framing errors\n"); dev->frame_errors = 0; } static int net1080_rx_fixup(struct usbnet *dev, struct sk_buff *skb) { struct nc_header *header; struct nc_trailer *trailer; u16 hdr_len, packet_len; /* This check is no longer done by usbnet */ if (skb->len < dev->net->hard_header_len) return 0; if (!(skb->len & 0x01)) { netdev_dbg(dev->net, "rx framesize %d range %d..%d mtu %d\n", skb->len, dev->net->hard_header_len, dev->hard_mtu, dev->net->mtu); dev->net->stats.rx_frame_errors++; nc_ensure_sync(dev); return 0; } header = (struct nc_header *) skb->data; hdr_len = le16_to_cpup(&header->hdr_len); packet_len = le16_to_cpup(&header->packet_len); if (FRAMED_SIZE(packet_len) > NC_MAX_PACKET) { dev->net->stats.rx_frame_errors++; netdev_dbg(dev->net, "packet too big, %d\n", packet_len); nc_ensure_sync(dev); return 0; } else if (hdr_len < MIN_HEADER) { dev->net->stats.rx_frame_errors++; netdev_dbg(dev->net, "header too short, %d\n", hdr_len); nc_ensure_sync(dev); return 0; } else if (hdr_len > MIN_HEADER) { // out of band data for us? netdev_dbg(dev->net, "header OOB, %d bytes\n", hdr_len - MIN_HEADER); nc_ensure_sync(dev); // switch (vendor/product ids) { ... 
} } skb_pull(skb, hdr_len); trailer = (struct nc_trailer *) (skb->data + skb->len - sizeof *trailer); skb_trim(skb, skb->len - sizeof *trailer); if ((packet_len & 0x01) == 0) { if (skb->data [packet_len] != PAD_BYTE) { dev->net->stats.rx_frame_errors++; netdev_dbg(dev->net, "bad pad\n"); return 0; } skb_trim(skb, skb->len - 1); } if (skb->len != packet_len) { dev->net->stats.rx_frame_errors++; netdev_dbg(dev->net, "bad packet len %d (expected %d)\n", skb->len, packet_len); nc_ensure_sync(dev); return 0; } if (header->packet_id != get_unaligned(&trailer->packet_id)) { dev->net->stats.rx_fifo_errors++; netdev_dbg(dev->net, "(2+ dropped) rx packet_id mismatch 0x%x 0x%x\n", le16_to_cpu(header->packet_id), le16_to_cpu(trailer->packet_id)); return 0; } #if 0 netdev_dbg(dev->net, "frame <rx h %d p %d id %d\n", header->hdr_len, header->packet_len, header->packet_id); #endif dev->frame_errors = 0; return 1; } static struct sk_buff * net1080_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { struct sk_buff *skb2; struct nc_header *header = NULL; struct nc_trailer *trailer = NULL; int padlen = sizeof (struct nc_trailer); int len = skb->len; if (!((len + padlen + sizeof (struct nc_header)) & 0x01)) padlen++; if (!skb_cloned(skb)) { int headroom = skb_headroom(skb); int tailroom = skb_tailroom(skb); if (padlen <= tailroom && sizeof(struct nc_header) <= headroom) /* There's enough head and tail room */ goto encapsulate; if ((sizeof (struct nc_header) + padlen) < (headroom + tailroom)) { /* There's enough total room, so just readjust */ skb->data = memmove(skb->head + sizeof (struct nc_header), skb->data, skb->len); skb_set_tail_pointer(skb, len); goto encapsulate; } } /* Create a new skb to use with the correct size */ skb2 = skb_copy_expand(skb, sizeof (struct nc_header), padlen, flags); dev_kfree_skb_any(skb); if (!skb2) return skb2; skb = skb2; encapsulate: /* header first */ header = skb_push(skb, sizeof *header); header->hdr_len = cpu_to_le16(sizeof (*header)); header->packet_len = cpu_to_le16(len); header->packet_id = cpu_to_le16((u16)dev->xid++); /* maybe pad; then trailer */ if (!((skb->len + sizeof *trailer) & 0x01)) skb_put_u8(skb, PAD_BYTE); trailer = skb_put(skb, sizeof *trailer); put_unaligned(header->packet_id, &trailer->packet_id); #if 0 netdev_dbg(dev->net, "frame >tx h %d p %d id %d\n", header->hdr_len, header->packet_len, header->packet_id); #endif return skb; } static int net1080_bind(struct usbnet *dev, struct usb_interface *intf) { unsigned extra = sizeof (struct nc_header) + 1 + sizeof (struct nc_trailer); dev->net->hard_header_len += extra; dev->rx_urb_size = dev->net->hard_header_len + dev->net->mtu; dev->hard_mtu = NC_MAX_PACKET; return usbnet_get_endpoints (dev, intf); } static const struct driver_info net1080_info = { .description = "NetChip TurboCONNECT", .flags = FLAG_POINTTOPOINT | FLAG_FRAMING_NC, .bind = net1080_bind, .reset = net1080_reset, .check_connect = net1080_check_connect, .rx_fixup = net1080_rx_fixup, .tx_fixup = net1080_tx_fixup, }; static const struct usb_device_id products [] = { { USB_DEVICE(0x0525, 0x1080), // NetChip ref design .driver_info = (unsigned long) &net1080_info, }, { USB_DEVICE(0x06D0, 0x0622), // Laplink Gold .driver_info = (unsigned long) &net1080_info, }, { }, // END }; MODULE_DEVICE_TABLE(usb, products); static struct usb_driver net1080_driver = { .name = "net1080", .id_table = products, .probe = usbnet_probe, .disconnect = usbnet_disconnect, .suspend = usbnet_suspend, .resume = usbnet_resume, .disable_hub_initiated_lpm = 1, }; 
module_usb_driver(net1080_driver); MODULE_AUTHOR("David Brownell"); MODULE_DESCRIPTION("NetChip 1080 based USB Host-to-Host Links"); MODULE_LICENSE("GPL");
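/*
 * Editor's illustration (not part of the original driver): a stand-alone,
 * user-space C sketch of the NetChip framing arithmetic used above. It
 * mirrors the nc_header/nc_trailer layout and the padding rule from
 * net1080_tx_fixup(), so FRAMED_SIZE() and the "length must be odd" receive
 * check can be verified on paper. The struct and macro names are copied for
 * readability only; this program is an illustrative demo, not driver code.
 * Build with: cc -o ncframe ncframe.c && ./ncframe
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct nc_header {		/* 6 bytes on the wire */
	uint16_t hdr_len;
	uint16_t packet_len;
	uint16_t packet_id;
} __attribute__((packed));

struct nc_trailer {		/* 2 bytes on the wire */
	uint16_t packet_id;
} __attribute__((packed));

#define ETH_HLEN 14		/* bytes in an ethernet header */

/* Same arithmetic as FRAMED_SIZE(mtu): header + ethhdr + mtu + pad + trailer */
static unsigned int framed_size(unsigned int mtu)
{
	return sizeof(struct nc_header) + ETH_HLEN + mtu + 1 +
	       sizeof(struct nc_trailer);
}

/* The transmit path pads only when needed so the final length ends up odd. */
static unsigned int wire_len(unsigned int packet_len)
{
	unsigned int len = sizeof(struct nc_header) + packet_len;

	if (!((len + sizeof(struct nc_trailer)) & 0x01))
		len += 1;			/* PAD_BYTE (0xAC) */
	return len + sizeof(struct nc_trailer);
}

int main(void)
{
	assert(sizeof(struct nc_header) == 6 && sizeof(struct nc_trailer) == 2);
	printf("FRAMED_SIZE(1500) = %u\n", framed_size(1500));	/* 1523 */
	printf("wire length of a 1514 byte packet = %u\n",
	       wire_len(ETH_HLEN + 1500));			/* 1523, odd */
	assert(wire_len(ETH_HLEN + 1500) & 0x01);
	return 0;
}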
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,ip type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ #define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); /* Type specific function prefix */ #define HTYPE hash_ipportip /* IPv4 variant */ /* Member elements */ struct hash_ipportip4_elem { __be32 ip; __be32 ip2; __be16 port; u8 proto; u8 padding; }; static bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip4_data_list(struct sk_buff *skb, const struct hash_ipportip4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) {
next->ip = d->ip; next->port = d->port; } /* Common functions */ #define MTYPE hash_ipportip4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to, i = 0; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? 
ntohs(h->next.port) : port; for (; p <= port_to; p++, i++) { e.ip = htonl(ip); e.port = htons(p); if (i > IPSET_MAX_RANGE) { hash_ipportip4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } } return ret; } /* IPv6 variant */ struct hash_ipportip6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip6_data_list(struct sk_buff *skb, const struct hash_ipportip6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportip6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportip6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 
0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportip_type __read_mostly = { .name = "hash:ip,port,ip", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportip_init(void) { return ip_set_type_register(&hash_ipportip_type); } static void __exit hash_ipportip_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportip_type); } module_init(hash_ipportip_init); module_exit(hash_ipportip_fini); |
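/*
 * Editor's illustration (not part of the original module): a stand-alone,
 * user-space sketch of the range expansion performed by the IPv4 uadt path
 * above. Adding a CIDR (or IP_TO) together with PORT_TO makes the nested
 * loops insert one element per (ip, port) pair, which is why the code bails
 * out with -ERANGE once the running count passes IPSET_MAX_RANGE. The helper
 * below only imitates the ip_set_mask_from_to() semantics in host byte order;
 * names and values are illustrative.
 * Build with: cc -o ipset_range ipset_range.c && ./ipset_range
 */
#include <stdint.h>
#include <stdio.h>

/* Derive the [lower, upper] host-order range covered by ip/cidr. */
static void range_from_cidr(uint32_t ip, unsigned int cidr,
			    uint32_t *lower, uint32_t *upper)
{
	uint32_t mask = cidr ? ~0U << (32 - cidr) : 0;

	*lower = ip & mask;
	*upper = *lower | ~mask;
}

int main(void)
{
	uint32_t lower, upper;
	unsigned int port_from = 80, port_to = 89;
	uint64_t elements;

	range_from_cidr(0xC0A8000AU, 24, &lower, &upper);	/* 192.168.0.10/24 */

	/* One hash element per (ip, port) combination. */
	elements = (uint64_t)(upper - lower + 1) * (port_to - port_from + 1);
	printf("192.168.0.0/24 x ports 80-89 expands to %llu elements\n",
	       (unsigned long long)elements);			/* 2560 */
	return 0;
}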
/* * net/tipc/name_table.c: TIPC name table code * * Copyright (c) 2000-2006, 2014-2018, Ericsson AB * Copyright (c) 2004-2008, 2010-2014, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include <net/sock.h> #include <linux/list_sort.h> #include <linux/rbtree_augmented.h> #include "core.h" #include "netlink.h" #include "name_table.h" #include "name_distr.h" #include "subscr.h" #include "bcast.h" #include "addr.h" #include "node.h" #include "group.h" /** * struct service_range - container for all bindings of a service range * @lower: service range lower bound * @upper: service range upper bound * @tree_node: member of service range RB tree * @max: largest 'upper' in this node subtree * @local_publ: list of identical publications made from this node * Used by closest_first lookup and multicast lookup algorithm * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm */ struct service_range { u32 lower; u32 upper; struct rb_node tree_node; u32 max; struct list_head local_publ; struct list_head all_publ; }; /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type * @lock: spinlock controlling access to pertaining service ranges/publications * @rcu: RCU callback head used for deferred freeing */ struct tipc_service { u32 type; u32 publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; spinlock_t lock; /* Covers service range list */ struct rcu_head rcu; }; #define service_range_upper(sr) ((sr)->upper) RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, struct service_range, tree_node, u32, max, service_range_upper) #define service_range_entry(rbtree_node) \ (container_of(rbtree_node, struct service_range, tree_node)) #define service_range_overlap(sr, start, end) \ ((sr)->lower <= (end) && (sr)->upper >= (start)) /** * service_range_foreach_match - iterate over tipc service rbtree for each * range match * @sr: the service range pointer as a loop cursor * @sc: the pointer to tipc service which holds the service range rbtree * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching */ #define service_range_foreach_match(sr, sc, start, end) \ for (sr = service_range_match_first((sc)->ranges.rb_node, \ start, \ end); \ sr; \ sr = service_range_match_next(&(sr)->tree_node, \ start, \ end)) /** * service_range_match_first - find first service range matching a range * @n: the root node of service range rbtree for searching * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the leftmost service range node in the rbtree that overlaps the * specific range if any. Otherwise, returns NULL. 
*/ static struct service_range *service_range_match_first(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *l, *r; /* Non overlaps in tree at all? */ if (!n || service_range_entry(n)->max < start) return NULL; while (n) { l = n->rb_left; if (l && service_range_entry(l)->max >= start) { /* A leftmost overlap range node must be one in the left * subtree. If not, it has lower > end, then nodes on * the right side cannot satisfy the condition either. */ n = l; continue; } /* No one in the left subtree can match, return if this node is * an overlap i.e. leftmost. */ sr = service_range_entry(n); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup on the right side */ r = n->rb_right; if (sr->lower <= end && r && service_range_entry(r)->max >= start) { n = r; continue; } break; } return NULL; } /** * service_range_match_next - find next service range matching a range * @n: a node in service range rbtree from which the searching starts * @start: beginning of the search range (end >= start) for matching * @end: end of the search range (end >= start) for matching * * Return: the next service range node to the given node in the rbtree that * overlaps the specific range if any. Otherwise, returns NULL. */ static struct service_range *service_range_match_next(struct rb_node *n, u32 start, u32 end) { struct service_range *sr; struct rb_node *p, *r; while (n) { r = n->rb_right; if (r && service_range_entry(r)->max >= start) /* A next overlap range node must be one in the right * subtree. If not, it has lower > end, then any next * successor (- an ancestor) of this node cannot * satisfy the condition either. */ return service_range_match_first(r, start, end); /* No one in the right subtree can match, go up to find an * ancestor of this node which is parent of a left-hand child. */ while ((p = rb_parent(n)) && n == p->rb_right) n = p; if (!p) break; /* Return if this ancestor is an overlap */ sr = service_range_entry(p); if (service_range_overlap(sr, start, end)) return sr; /* Ok, try to lookup more from this ancestor */ if (sr->lower <= end) { n = p; continue; } break; } return NULL; } static int hash(int x) { return x & (TIPC_NAMETBL_SIZE - 1); } /** * tipc_publ_create - create a publication structure * @ua: the service range the user is binding to * @sk: the address of the socket that is bound * @key: publication key */ static struct publication *tipc_publ_create(struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct publication *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return NULL; p->sr = ua->sr; p->sk = *sk; p->scope = ua->scope; p->key = key; INIT_LIST_HEAD(&p->binding_sock); INIT_LIST_HEAD(&p->binding_node); INIT_LIST_HEAD(&p->local_publ); INIT_LIST_HEAD(&p->all_publ); INIT_LIST_HEAD(&p->list); return p; } /** * tipc_service_create - create a service structure for the specified 'type' * @net: network namespace * @ua: address representing the service to be bound * * Allocates a single range structure and sets it to all 0's. 
*/ static struct tipc_service *tipc_service_create(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct tipc_service *service; struct hlist_head *hd; service = kzalloc(sizeof(*service), GFP_ATOMIC); if (!service) { pr_warn("Service creation failed, no memory\n"); return NULL; } spin_lock_init(&service->lock); service->type = ua->sr.type; service->ranges = RB_ROOT; INIT_HLIST_NODE(&service->service_list); INIT_LIST_HEAD(&service->subscriptions); hd = &nt->services[hash(ua->sr.type)]; hlist_add_head_rcu(&service->service_list, hd); return service; } /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, struct tipc_uaddr *ua) { struct service_range *sr; service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { /* Look for exact match */ if (sr->lower == ua->sr.lower && sr->upper == ua->sr.upper) return sr; } return NULL; } static struct service_range *tipc_service_create_range(struct tipc_service *sc, struct publication *p) { struct rb_node **n, *parent = NULL; struct service_range *sr; u32 lower = p->sr.lower; u32 upper = p->sr.upper; n = &sc->ranges.rb_node; while (*n) { parent = *n; sr = service_range_entry(parent); if (lower == sr->lower && upper == sr->upper) return sr; if (sr->max < upper) sr->max = upper; if (lower <= sr->lower) n = &parent->rb_left; else n = &parent->rb_right; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; sr->upper = upper; sr->max = upper; INIT_LIST_HEAD(&sr->local_publ); INIT_LIST_HEAD(&sr->all_publ); rb_link_node(&sr->tree_node, parent, n); rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); return sr; } static bool tipc_service_insert_publ(struct net *net, struct tipc_service *sc, struct publication *p) { struct tipc_subscription *sub, *tmp; struct service_range *sr; struct publication *_p; u32 node = p->sk.node; bool first = false; bool res = false; u32 key = p->key; spin_lock_bh(&sc->lock); sr = tipc_service_create_range(sc, p); if (!sr) goto exit; first = list_empty(&sr->all_publ); /* Return if the publication already exists */ list_for_each_entry(_p, &sr->all_publ, all_publ) { if (_p->key == key && (!_p->sk.node || _p->sk.node == node)) { pr_debug("Failed to bind duplicate %u,%u,%u/%u:%u/%u\n", p->sr.type, p->sr.lower, p->sr.upper, node, p->sk.ref, key); goto exit; } } if (in_own_node(net, p->sk.node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); p->id = sc->publ_cnt++; /* Any subscriptions waiting for notification? 
*/ list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, first); } res = true; exit: if (!res) pr_warn("Failed to bind to %u,%u,%u\n", p->sr.type, p->sr.lower, p->sr.upper); spin_unlock_bh(&sc->lock); return res; } /** * tipc_service_remove_publ - remove a publication from a service * @r: service_range to remove publication from * @sk: address publishing socket * @key: target publication key */ static struct publication *tipc_service_remove_publ(struct service_range *r, struct tipc_socket_addr *sk, u32 key) { struct publication *p; u32 node = sk->node; list_for_each_entry(p, &r->all_publ, all_publ) { if (p->key != key || (node && node != p->sk.node)) continue; list_del(&p->all_publ); list_del(&p->local_publ); return p; } return NULL; } /* * Code reused: time_after32() for the same purpose */ #define publication_after(pa, pb) time_after32((pa)->id, (pb)->id) static int tipc_publ_sort(void *priv, const struct list_head *a, const struct list_head *b) { struct publication *pa, *pb; pa = container_of(a, struct publication, list); pb = container_of(b, struct publication, list); return publication_after(pa, pb); } /** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service * range overlapping with the requested range * @service: the tipc_service to attach the @sub to * @sub: the subscription to attach */ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct publication *p, *first, *tmp; struct list_head publ_list; struct service_range *sr; u32 filter, lower, upper; filter = sub->s.filter; lower = sub->s.seq.lower; upper = sub->s.seq.upper; tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); if (filter & TIPC_SUB_NO_STATUS) return; INIT_LIST_HEAD(&publ_list); service_range_foreach_match(sr, service, lower, upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) list_add_tail(&p->list, &publ_list); else if (!first || publication_after(first, p)) /* Pick this range's *first* publication */ first = p; } if (first) list_add_tail(&first->list, &publ_list); } /* Sort the publications before reporting */ list_sort(NULL, &publ_list, tipc_publ_sort); list_for_each_entry_safe(p, tmp, &publ_list, list) { tipc_sub_report_overlap(sub, p, TIPC_PUBLISHED, true); list_del_init(&p->list); } } static struct tipc_service *tipc_service_find(struct net *net, struct tipc_uaddr *ua) { struct name_table *nt = tipc_name_table(net); struct hlist_head *service_head; struct tipc_service *service; service_head = &nt->services[hash(ua->sr.type)]; hlist_for_each_entry_rcu(service, service_head, service_list) { if (service->type == ua->sr.type) return service; } return NULL; }; struct publication *tipc_nametbl_insert_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_service *sc; struct publication *p; p = tipc_publ_create(ua, sk, key); if (!p) return NULL; sc = tipc_service_find(net, ua); if (!sc) sc = tipc_service_create(net, ua); if (sc && tipc_service_insert_publ(net, sc, p)) return p; kfree(p); return NULL; } struct publication *tipc_nametbl_remove_publ(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct tipc_subscription *sub, *tmp; struct publication *p = NULL; struct service_range *sr; struct tipc_service *sc; bool last; sc = tipc_service_find(net, ua); if (!sc) goto exit; 
spin_lock_bh(&sc->lock); sr = tipc_service_find_range(sc, ua); if (!sr) goto unlock; p = tipc_service_remove_publ(sr, sk, key); if (!p) goto unlock; /* Notify any waiting subscriptions */ last = list_empty(&sr->all_publ); list_for_each_entry_safe(sub, tmp, &sc->subscriptions, service_list) { tipc_sub_report_overlap(sub, p, TIPC_WITHDRAWN, last); } /* Remove service range item if this was its last publication */ if (list_empty(&sr->all_publ)) { rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } unlock: spin_unlock_bh(&sc->lock); exit: if (!p) { pr_err("Failed to remove unknown binding: %u,%u,%u/%u:%u/%u\n", ua->sr.type, ua->sr.lower, ua->sr.upper, sk->node, sk->ref, key); } return p; } /** * tipc_nametbl_lookup_anycast - perform service instance to socket translation * @net: network namespace * @ua: service address to look up * @sk: address to socket we want to find * * On entry, a non-zero 'sk->node' indicates the node where we want lookup to be * performed, which may not be this one. * * On exit: * * - If lookup is deferred to another node, leave 'sk->node' unchanged and * return 'true'. * - If lookup is successful, set the 'sk->node' and 'sk->ref' (== portid) which * represent the bound socket and return 'true'. * - If lookup fails, return 'false' * * Note that for legacy users (node configured with Z.C.N address format) the * 'closest-first' lookup algorithm must be maintained, i.e., if sk.node is 0 * we must look in the local binding list first */ bool tipc_nametbl_lookup_anycast(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk) { struct tipc_net *tn = tipc_net(net); bool legacy = tn->legacy_addr_format; u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *r; struct tipc_service *sc; struct publication *p; struct list_head *l; bool res = false; if (!tipc_in_scope(legacy, sk->node, self)) return true; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(r, sc, inst, inst) { /* Select lookup algo: local, closest-first or round-robin */ if (sk->node == self) { l = &r->local_publ; if (list_empty(l)) continue; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else if (legacy && !sk->node && !list_empty(&r->local_publ)) { l = &r->local_publ; p = list_first_entry(l, struct publication, local_publ); list_move_tail(&p->local_publ, &r->local_publ); } else { l = &r->all_publ; p = list_first_entry(l, struct publication, all_publ); list_move_tail(&p->all_publ, &r->all_publ); } *sk = p->sk; res = true; /* Todo: as for legacy, pick the first matching range only, a * "true" round-robin will be performed as needed. */ break; } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return res; } /* tipc_nametbl_lookup_group(): lookup destinaton(s) in a communication group * Returns a list of one (== group anycast) or more (== group multicast) * destination socket/node pairs matching the given address. * The requester may or may not want to exclude himself from the list. 
*/ bool tipc_nametbl_lookup_group(struct net *net, struct tipc_uaddr *ua, struct list_head *dsts, int *dstcnt, u32 exclude, bool mcast) { u32 self = tipc_own_addr(net); u32 inst = ua->sa.instance; struct service_range *sr; struct tipc_service *sc; struct publication *p; *dstcnt = 0; rcu_read_lock(); sc = tipc_service_find(net, ua); if (unlikely(!sc)) goto exit; spin_lock_bh(&sc->lock); /* Todo: a full search i.e. service_range_foreach_match() instead? */ sr = service_range_match_first(sc->ranges.rb_node, inst, inst); if (!sr) goto no_match; list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; if (p->sk.ref == exclude && p->sk.node == self) continue; tipc_dest_push(dsts, p->sk.node, p->sk.ref); (*dstcnt)++; if (mcast) continue; list_move_tail(&p->all_publ, &sr->all_publ); break; } no_match: spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); return !list_empty(dsts); } /* tipc_nametbl_lookup_mcast_sockets(): look up node local destinaton sockets * matching the given address * Used on nodes which have received a multicast/broadcast message * Returns a list of local sockets */ void tipc_nametbl_lookup_mcast_sockets(struct net *net, struct tipc_uaddr *ua, struct list_head *dports) { struct service_range *sr; struct tipc_service *sc; struct publication *p; u8 scope = ua->scope; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (scope == p->scope || scope == TIPC_ANY_SCOPE) tipc_dest_push(dports, 0, p->sk.ref); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_lookup_mcast_nodes(): look up all destination nodes matching * the given address. Used in sending node. 
* Used on nodes which are sending out a multicast/broadcast message * Returns a list of nodes, including own node if applicable */ void tipc_nametbl_lookup_mcast_nodes(struct net *net, struct tipc_uaddr *ua, struct tipc_nlist *nodes) { struct service_range *sr; struct tipc_service *sc; struct publication *p; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); service_range_foreach_match(sr, sc, ua->sr.lower, ua->sr.upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { tipc_nlist_add(nodes, p->sk.node); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_build_group - build list of communication group members */ void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, struct tipc_uaddr *ua) { struct service_range *sr; struct tipc_service *sc; struct publication *p; struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); list_for_each_entry(p, &sr->all_publ, all_publ) { if (p->scope != ua->scope) continue; tipc_group_add_member(grp, p->sk.node, p->sk.ref, p->sr.lower); } } spin_unlock_bh(&sc->lock); exit: rcu_read_unlock(); } /* tipc_nametbl_publish - add service binding to name table */ struct publication *tipc_nametbl_publish(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *p = NULL; struct sk_buff *skb = NULL; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); if (nt->local_publ_count >= TIPC_MAX_PUBL) { pr_warn("Bind failed, max limit %u reached\n", TIPC_MAX_PUBL); goto exit; } p = tipc_nametbl_insert_publ(net, ua, sk, key); if (p) { nt->local_publ_count++; skb = tipc_named_publish(net, p); } rc_dests = nt->rc_dests; exit: spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); return p; } /** * tipc_nametbl_withdraw - withdraw a service binding * @net: network namespace * @ua: service address/range being unbound * @sk: address of the socket being unbound from * @key: target publication key */ void tipc_nametbl_withdraw(struct net *net, struct tipc_uaddr *ua, struct tipc_socket_addr *sk, u32 key) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct sk_buff *skb = NULL; struct publication *p; u32 rc_dests; spin_lock_bh(&tn->nametbl_lock); p = tipc_nametbl_remove_publ(net, ua, sk, key); if (p) { nt->local_publ_count--; skb = tipc_named_withdraw(net, p); list_del_init(&p->binding_sock); kfree_rcu(p, rcu); } rc_dests = nt->rc_dests; spin_unlock_bh(&tn->nametbl_lock); if (skb) tipc_node_broadcast(net, skb, rc_dests); } /** * tipc_nametbl_subscribe - add a subscription object to the name table * @sub: subscription to add */ bool tipc_nametbl_subscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); u32 type = sub->s.seq.type; struct tipc_service *sc; struct tipc_uaddr ua; bool res = true; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) sc = tipc_service_create(sub->net, &ua); if (sc) { spin_lock_bh(&sc->lock); tipc_service_subscribe(sc, sub); spin_unlock_bh(&sc->lock); } else { pr_warn("Failed to subscribe for {%u,%u,%u}\n", type, sub->s.seq.lower, sub->s.seq.upper); res = false; } 
spin_unlock_bh(&tn->nametbl_lock); return res; } /** * tipc_nametbl_unsubscribe - remove a subscription object from name table * @sub: subscription to remove */ void tipc_nametbl_unsubscribe(struct tipc_subscription *sub) { struct tipc_net *tn = tipc_net(sub->net); struct tipc_service *sc; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, sub->s.seq.type, sub->s.seq.lower, sub->s.seq.upper); spin_lock_bh(&tn->nametbl_lock); sc = tipc_service_find(sub->net, &ua); if (!sc) goto exit; spin_lock_bh(&sc->lock); list_del_init(&sub->service_list); tipc_sub_put(sub); /* Delete service item if no more publications and subscriptions */ if (RB_EMPTY_ROOT(&sc->ranges) && list_empty(&sc->subscriptions)) { hlist_del_init_rcu(&sc->service_list); kfree_rcu(sc, rcu); } spin_unlock_bh(&sc->lock); exit: spin_unlock_bh(&tn->nametbl_lock); } int tipc_nametbl_init(struct net *net) { struct tipc_net *tn = tipc_net(net); struct name_table *nt; int i; nt = kzalloc(sizeof(*nt), GFP_KERNEL); if (!nt) return -ENOMEM; for (i = 0; i < TIPC_NAMETBL_SIZE; i++) INIT_HLIST_HEAD(&nt->services[i]); INIT_LIST_HEAD(&nt->node_scope); INIT_LIST_HEAD(&nt->cluster_scope); rwlock_init(&nt->cluster_scope_lock); tn->nametbl = nt; spin_lock_init(&tn->nametbl_lock); return 0; } /** * tipc_service_delete - purge all publications for a service and delete it * @net: the associated network namespace * @sc: tipc_service to delete */ static void tipc_service_delete(struct net *net, struct tipc_service *sc) { struct service_range *sr, *tmpr; struct publication *p, *tmp; spin_lock_bh(&sc->lock); rbtree_postorder_for_each_entry_safe(sr, tmpr, &sc->ranges, tree_node) { list_for_each_entry_safe(p, tmp, &sr->all_publ, all_publ) { tipc_service_remove_publ(sr, &p->sk, p->key); kfree_rcu(p, rcu); } rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } hlist_del_init_rcu(&sc->service_list); spin_unlock_bh(&sc->lock); kfree_rcu(sc, rcu); } void tipc_nametbl_stop(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct hlist_head *service_head; struct tipc_service *service; u32 i; /* Verify name table is empty and purge any lingering * publications, then release the name table */ spin_lock_bh(&tn->nametbl_lock); for (i = 0; i < TIPC_NAMETBL_SIZE; i++) { if (hlist_empty(&nt->services[i])) continue; service_head = &nt->services[i]; hlist_for_each_entry_rcu(service, service_head, service_list) { tipc_service_delete(net, service); } } spin_unlock_bh(&tn->nametbl_lock); /* TODO: clear tn->nametbl, implement proper RCU rules ? 
*/ kfree_rcu(nt, rcu); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, struct tipc_service *service, struct service_range *sr, u32 *last_key) { struct publication *p; struct nlattr *attrs; struct nlattr *b; void *hdr; if (*last_key) { list_for_each_entry(p, &sr->all_publ, all_publ) if (p->key == *last_key) break; if (list_entry_is_head(p, &sr->all_publ, all_publ)) return -EPIPE; } else { p = list_first_entry(&sr->all_publ, struct publication, all_publ); } list_for_each_entry_from(p, &sr->all_publ, all_publ) { *last_key = p->key; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NAME_TABLE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE); if (!attrs) goto msg_full; b = nla_nest_start_noflag(msg->skb, TIPC_NLA_NAME_TABLE_PUBL); if (!b) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_TYPE, service->type)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_LOWER, sr->lower)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_UPPER, sr->upper)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_SCOPE, p->scope)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_NODE, p->sk.node)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_REF, p->sk.ref)) goto publ_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PUBL_KEY, p->key)) goto publ_msg_full; nla_nest_end(msg->skb, b); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); } *last_key = 0; return 0; publ_msg_full: nla_nest_cancel(msg->skb, b); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static int __tipc_nl_service_range_list(struct tipc_nl_msg *msg, struct tipc_service *sc, u32 *last_lower, u32 *last_key) { struct service_range *sr; struct rb_node *n; int err; for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower < *last_lower) continue; err = __tipc_nl_add_nametable_publ(msg, sc, sr, last_key); if (err) { *last_lower = sr->lower; return err; } } *last_lower = 0; return 0; } static int tipc_nl_service_list(struct net *net, struct tipc_nl_msg *msg, u32 *last_type, u32 *last_lower, u32 *last_key) { struct tipc_net *tn = tipc_net(net); struct tipc_service *service = NULL; struct hlist_head *head; struct tipc_uaddr ua; int err; int i; if (*last_type) i = hash(*last_type); else i = 0; for (; i < TIPC_NAMETBL_SIZE; i++) { head = &tn->nametbl->services[i]; if (*last_type || (!i && *last_key && (*last_lower == *last_key))) { tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, *last_type, *last_lower, *last_lower); service = tipc_service_find(net, &ua); if (!service) return -EPIPE; } else { hlist_for_each_entry_rcu(service, head, service_list) break; if (!service) continue; } hlist_for_each_entry_from_rcu(service, service_list) { spin_lock_bh(&service->lock); err = __tipc_nl_service_range_list(msg, service, last_lower, last_key); if (err) { *last_type = service->type; spin_unlock_bh(&service->lock); return err; } spin_unlock_bh(&service->lock); } *last_type = 0; } return 0; } int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 last_type = cb->args[0]; u32 last_lower = cb->args[1]; u32 last_key = cb->args[2]; int done = cb->args[3]; struct tipc_nl_msg msg; int err; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); err = 
	      tipc_nl_service_list(net, &msg, &last_type,
				   &last_lower, &last_key);
	if (!err) {
		done = 1;
	} else if (err != -EMSGSIZE) {
		/* We never set seq or call nl_dump_check_consistent(). This
		 * means that setting prev_seq here will cause the consistency
		 * check to fail in the netlink callback handler, resulting in
		 * the NLMSG_DONE message having the NLM_F_DUMP_INTR flag set
		 * if we got an error.
		 */
		cb->prev_seq = 1;
	}
	rcu_read_unlock();

	cb->args[0] = last_type;
	cb->args[1] = last_lower;
	cb->args[2] = last_key;
	cb->args[3] = done;

	return skb->len;
}

struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port)
{
	struct tipc_dest *dst;

	list_for_each_entry(dst, l, list) {
		if (dst->node == node && dst->port == port)
			return dst;
	}
	return NULL;
}

bool tipc_dest_push(struct list_head *l, u32 node, u32 port)
{
	struct tipc_dest *dst;

	if (tipc_dest_find(l, node, port))
		return false;

	dst = kmalloc(sizeof(*dst), GFP_ATOMIC);
	if (unlikely(!dst))
		return false;

	dst->node = node;
	dst->port = port;
	list_add(&dst->list, l);
	return true;
}

bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port)
{
	struct tipc_dest *dst;

	if (list_empty(l))
		return false;

	dst = list_first_entry(l, typeof(*dst), list);
	if (port)
		*port = dst->port;
	if (node)
		*node = dst->node;
	list_del(&dst->list);
	kfree(dst);
	return true;
}

bool tipc_dest_del(struct list_head *l, u32 node, u32 port)
{
	struct tipc_dest *dst;

	dst = tipc_dest_find(l, node, port);
	if (!dst)
		return false;

	list_del(&dst->list);
	kfree(dst);
	return true;
}

void tipc_dest_list_purge(struct list_head *l)
{
	struct tipc_dest *dst, *tmp;

	list_for_each_entry_safe(dst, tmp, l, list) {
		list_del(&dst->list);
		kfree(dst);
	}
}
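The comments in service_range_match_first() and service_range_match_next() above describe an interval search over a tree whose nodes cache the largest 'upper' bound of their subtree in 'max', which is what lets whole subtrees be skipped. The stand-alone sketch below is only a rough illustration of that pruning rule, written in plain C with a hand-built binary tree instead of the kernel's augmented rbtree; the names used here (range_node, match_first) are hypothetical and not part of the kernel sources.

#include <stdio.h>

struct range_node {
	unsigned int lower;
	unsigned int upper;
	unsigned int max;		/* largest 'upper' in this subtree */
	struct range_node *left;	/* lower bounds <= this node's */
	struct range_node *right;	/* lower bounds >= this node's */
};

/* Leftmost node whose [lower, upper] overlaps [start, end], or NULL */
static struct range_node *match_first(struct range_node *n,
				      unsigned int start, unsigned int end)
{
	while (n) {
		if (n->max < start)
			return NULL;	/* nothing in this subtree reaches 'start' */
		if (n->left && n->left->max >= start) {
			n = n->left;	/* a leftmost match must be on the left */
			continue;
		}
		if (n->lower <= end && n->upper >= start)
			return n;	/* left side exhausted: this node is leftmost */
		if (n->lower <= end && n->right && n->right->max >= start) {
			n = n->right;	/* only the right side can still match */
			continue;
		}
		break;
	}
	return NULL;
}

int main(void)
{
	/* ranges [10,20], [30,40], [35,50]; each node caches its subtree max */
	struct range_node a = { 10, 20, 20, NULL, NULL };
	struct range_node c = { 35, 50, 50, NULL, NULL };
	struct range_node b = { 30, 40, 50, &a, &c };
	struct range_node *m = match_first(&b, 15, 32);

	if (m)
		printf("first overlap: [%u,%u]\n", m->lower, m->upper);
	return 0;
}

Compiled on its own, this prints "first overlap: [10,20]" for the query [15,32], which matches the leftmost-overlap behaviour the kernel helpers aim for.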
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2008 IBM Corporation
 * Author: Mimi Zohar <zohar@us.ibm.com>
 *
 * ima_policy.c
 *	- initialize default measure policy rules
 */

#include <linux/init.h>
#include <linux/list.h>
#include <linux/kernel_read_file.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/parser.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <linux/seq_file.h>
#include <linux/ima.h>

#include "ima.h"

/* flags definitions */
#define IMA_FUNC	0x0001
#define IMA_MASK	0x0002
#define IMA_FSMAGIC	0x0004
#define IMA_UID		0x0008
#define IMA_FOWNER	0x0010
#define IMA_FSUUID	0x0020
#define IMA_INMASK	0x0040
#define IMA_EUID	0x0080
#define IMA_PCR		0x0100
#define IMA_FSNAME	0x0200
#define IMA_KEYRINGS	0x0400
#define IMA_LABEL	0x0800
#define IMA_VALIDATE_ALGOS	0x1000
#define IMA_GID		0x2000
#define IMA_EGID	0x4000
#define IMA_FGROUP	0x8000

#define UNKNOWN		0
#define MEASURE		0x0001	/* same as IMA_MEASURE */
#define DONT_MEASURE	0x0002
#define APPRAISE	0x0004	/* same as IMA_APPRAISE */
#define DONT_APPRAISE	0x0008
#define AUDIT		0x0040
#define HASH		0x0100
#define DONT_HASH	0x0200

#define INVALID_PCR(a) (((a) < 0) || \
	(a) >= (sizeof_field(struct ima_iint_cache, measured_pcrs) * 8))

int ima_policy_flag;
static int temp_ima_appraise;
static int build_ima_appraise __ro_after_init;

atomic_t ima_setxattr_allowed_hash_algorithms;

#define MAX_LSM_RULES 6
enum lsm_rule_types { LSM_OBJ_USER, LSM_OBJ_ROLE, LSM_OBJ_TYPE,
	LSM_SUBJ_USER, LSM_SUBJ_ROLE, LSM_SUBJ_TYPE
};

enum policy_types { ORIGINAL_TCB = 1, DEFAULT_TCB };

enum policy_rule_list { IMA_DEFAULT_POLICY = 1, IMA_CUSTOM_POLICY };

struct ima_rule_opt_list {
	size_t count;
	char *items[] __counted_by(count);
};

/*
 * These comparators are needed nowhere outside of ima so just define them
 * here. This pattern should hopefully never be needed outside of ima.
*/ static inline bool vfsuid_gt_kuid(vfsuid_t vfsuid, kuid_t kuid) { return __vfsuid_val(vfsuid) > __kuid_val(kuid); } static inline bool vfsgid_gt_kgid(vfsgid_t vfsgid, kgid_t kgid) { return __vfsgid_val(vfsgid) > __kgid_val(kgid); } static inline bool vfsuid_lt_kuid(vfsuid_t vfsuid, kuid_t kuid) { return __vfsuid_val(vfsuid) < __kuid_val(kuid); } static inline bool vfsgid_lt_kgid(vfsgid_t vfsgid, kgid_t kgid) { return __vfsgid_val(vfsgid) < __kgid_val(kgid); } struct ima_rule_entry { struct list_head list; int action; unsigned int flags; enum ima_hooks func; int mask; unsigned long fsmagic; uuid_t fsuuid; kuid_t uid; kgid_t gid; kuid_t fowner; kgid_t fgroup; bool (*uid_op)(kuid_t cred_uid, kuid_t rule_uid); /* Handlers for operators */ bool (*gid_op)(kgid_t cred_gid, kgid_t rule_gid); bool (*fowner_op)(vfsuid_t vfsuid, kuid_t rule_uid); /* vfsuid_eq_kuid(), vfsuid_gt_kuid(), vfsuid_lt_kuid() */ bool (*fgroup_op)(vfsgid_t vfsgid, kgid_t rule_gid); /* vfsgid_eq_kgid(), vfsgid_gt_kgid(), vfsgid_lt_kgid() */ int pcr; unsigned int allowed_algos; /* bitfield of allowed hash algorithms */ struct { void *rule; /* LSM file metadata specific */ char *args_p; /* audit value */ int type; /* audit type */ } lsm[MAX_LSM_RULES]; char *fsname; struct ima_rule_opt_list *keyrings; /* Measure keys added to these keyrings */ struct ima_rule_opt_list *label; /* Measure data grouped under this label */ struct ima_template_desc *template; }; /* * sanity check in case the kernels gains more hash algorithms that can * fit in an unsigned int */ static_assert( 8 * sizeof(unsigned int) >= HASH_ALGO__LAST, "The bitfield allowed_algos in ima_rule_entry is too small to contain all the supported hash algorithms, consider using a bigger type"); /* * Without LSM specific knowledge, the default policy can only be * written in terms of .action, .func, .mask, .fsmagic, .uid, .gid, * .fowner, and .fgroup */ /* * The minimum rule set to allow for full TCB coverage. Measures all files * opened or mmap for exec and everything read by root. Dangerous because * normal users can easily run the machine out of memory simply building * and running executables. 
*/ static struct ima_rule_entry dont_measure_rules[] __ro_after_init = { {.action = DONT_MEASURE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = TMPFS_MAGIC, .func = FILE_CHECK, .flags = IMA_FSMAGIC | IMA_FUNC}, {.action = DONT_MEASURE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = CGROUP_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = CGROUP2_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC} }; static struct ima_rule_entry original_measurement_rules[] __ro_after_init = { {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ, .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq, .flags = IMA_FUNC | IMA_MASK | IMA_UID}, {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC}, {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC}, }; static struct ima_rule_entry default_measurement_rules[] __ro_after_init = { {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ, .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq, .flags = IMA_FUNC | IMA_INMASK | IMA_EUID}, {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ, .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq, .flags = IMA_FUNC | IMA_INMASK | IMA_UID}, {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC}, {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC}, {.action = MEASURE, .func = POLICY_CHECK, .flags = IMA_FUNC}, }; static struct ima_rule_entry default_appraise_rules[] __ro_after_init = { {.action = DONT_APPRAISE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = TMPFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = RAMFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = CGROUP_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = CGROUP2_SUPER_MAGIC, .flags = IMA_FSMAGIC}, #ifdef CONFIG_IMA_WRITE_POLICY {.action = APPRAISE, .func = 
POLICY_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifndef CONFIG_IMA_APPRAISE_SIGNED_INIT {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &vfsuid_eq_kuid, .flags = IMA_FOWNER}, #else /* force signature */ {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &vfsuid_eq_kuid, .flags = IMA_FOWNER | IMA_DIGSIG_REQUIRED}, #endif }; static struct ima_rule_entry build_appraise_rules[] __ro_after_init = { #ifdef CONFIG_IMA_APPRAISE_REQUIRE_MODULE_SIGS {.action = APPRAISE, .func = MODULE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifdef CONFIG_IMA_APPRAISE_REQUIRE_FIRMWARE_SIGS {.action = APPRAISE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifdef CONFIG_IMA_APPRAISE_REQUIRE_KEXEC_SIGS {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifdef CONFIG_IMA_APPRAISE_REQUIRE_POLICY_SIGS {.action = APPRAISE, .func = POLICY_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif }; static struct ima_rule_entry secure_boot_rules[] __ro_after_init = { {.action = APPRAISE, .func = MODULE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, {.action = APPRAISE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, {.action = APPRAISE, .func = POLICY_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, }; static struct ima_rule_entry critical_data_rules[] __ro_after_init = { {.action = MEASURE, .func = CRITICAL_DATA, .flags = IMA_FUNC}, }; /* An array of architecture specific rules */ static struct ima_rule_entry *arch_policy_entry __ro_after_init; static LIST_HEAD(ima_default_rules); static LIST_HEAD(ima_policy_rules); static LIST_HEAD(ima_temp_rules); static struct list_head __rcu *ima_rules = (struct list_head __rcu *)(&ima_default_rules); static int ima_policy __initdata; static int __init default_measure_policy_setup(char *str) { if (ima_policy) return 1; ima_policy = ORIGINAL_TCB; return 1; } __setup("ima_tcb", default_measure_policy_setup); static bool ima_use_appraise_tcb __initdata; static bool ima_use_secure_boot __initdata; static bool ima_use_critical_data __initdata; static bool ima_fail_unverifiable_sigs __ro_after_init; static int __init policy_setup(char *str) { char *p; while ((p = strsep(&str, " |\n")) != NULL) { if (*p == ' ') continue; if ((strcmp(p, "tcb") == 0) && !ima_policy) ima_policy = DEFAULT_TCB; else if (strcmp(p, "appraise_tcb") == 0) ima_use_appraise_tcb = true; else if (strcmp(p, "secure_boot") == 0) ima_use_secure_boot = true; else if (strcmp(p, "critical_data") == 0) ima_use_critical_data = true; else if (strcmp(p, "fail_securely") == 0) ima_fail_unverifiable_sigs = true; else pr_err("policy \"%s\" not found", p); } return 1; } __setup("ima_policy=", policy_setup); static int __init default_appraise_policy_setup(char *str) { ima_use_appraise_tcb = true; return 1; } __setup("ima_appraise_tcb", default_appraise_policy_setup); static struct ima_rule_opt_list *ima_alloc_rule_opt_list(const substring_t *src) { struct ima_rule_opt_list *opt_list; size_t count = 0; char *src_copy; char *cur, *next; size_t i; src_copy = match_strdup(src); if (!src_copy) return ERR_PTR(-ENOMEM); next = src_copy; while ((cur = strsep(&next, "|"))) { /* Don't accept an empty list item */ if (!(*cur)) { kfree(src_copy); return ERR_PTR(-EINVAL); } count++; } /* Don't accept an empty list */ if (!count) { kfree(src_copy); return ERR_PTR(-EINVAL); } opt_list = kzalloc(struct_size(opt_list, 
items, count), GFP_KERNEL); if (!opt_list) { kfree(src_copy); return ERR_PTR(-ENOMEM); } opt_list->count = count; /* * strsep() has already replaced all instances of '|' with '\0', * leaving a byte sequence of NUL-terminated strings. Reference each * string with the array of items. * * IMPORTANT: Ownership of the allocated buffer is transferred from * src_copy to the first element in the items array. To free the * buffer, kfree() must only be called on the first element of the * array. */ for (i = 0, cur = src_copy; i < count; i++) { opt_list->items[i] = cur; cur = strchr(cur, '\0') + 1; } return opt_list; } static void ima_free_rule_opt_list(struct ima_rule_opt_list *opt_list) { if (!opt_list) return; if (opt_list->count) { kfree(opt_list->items[0]); opt_list->count = 0; } kfree(opt_list); } static void ima_lsm_free_rule(struct ima_rule_entry *entry) { int i; for (i = 0; i < MAX_LSM_RULES; i++) { ima_filter_rule_free(entry->lsm[i].rule); kfree(entry->lsm[i].args_p); } } static void ima_free_rule(struct ima_rule_entry *entry) { if (!entry) return; /* * entry->template->fields may be allocated in ima_parse_rule() but that * reference is owned by the corresponding ima_template_desc element in * the defined_templates list and cannot be freed here */ kfree(entry->fsname); ima_free_rule_opt_list(entry->keyrings); ima_lsm_free_rule(entry); kfree(entry); } static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry, gfp_t gfp) { struct ima_rule_entry *nentry; int i; /* * Immutable elements are copied over as pointers and data; only * lsm rules can change */ nentry = kmemdup(entry, sizeof(*nentry), gfp); if (!nentry) return NULL; memset(nentry->lsm, 0, sizeof_field(struct ima_rule_entry, lsm)); for (i = 0; i < MAX_LSM_RULES; i++) { if (!entry->lsm[i].args_p) continue; nentry->lsm[i].type = entry->lsm[i].type; nentry->lsm[i].args_p = entry->lsm[i].args_p; ima_filter_rule_init(nentry->lsm[i].type, Audit_equal, nentry->lsm[i].args_p, &nentry->lsm[i].rule, gfp); if (!nentry->lsm[i].rule) pr_warn("rule for LSM \'%s\' is undefined\n", nentry->lsm[i].args_p); } return nentry; } static int ima_lsm_update_rule(struct ima_rule_entry *entry) { int i; struct ima_rule_entry *nentry; nentry = ima_lsm_copy_rule(entry, GFP_KERNEL); if (!nentry) return -ENOMEM; list_replace_rcu(&entry->list, &nentry->list); synchronize_rcu(); /* * ima_lsm_copy_rule() shallow copied all references, except for the * LSM references, from entry to nentry so we only want to free the LSM * references and the entry itself. All other memory references will now * be owned by nentry. */ for (i = 0; i < MAX_LSM_RULES; i++) ima_filter_rule_free(entry->lsm[i].rule); kfree(entry); return 0; } static bool ima_rule_contains_lsm_cond(struct ima_rule_entry *entry) { int i; for (i = 0; i < MAX_LSM_RULES; i++) if (entry->lsm[i].args_p) return true; return false; } /* * The LSM policy can be reloaded, leaving the IMA LSM based rules referring * to the old, stale LSM policy. Update the IMA LSM based rules to reflect * the reloaded LSM policy. 
*/ static void ima_lsm_update_rules(void) { struct ima_rule_entry *entry, *e; int result; list_for_each_entry_safe(entry, e, &ima_policy_rules, list) { if (!ima_rule_contains_lsm_cond(entry)) continue; result = ima_lsm_update_rule(entry); if (result) { pr_err("lsm rule update error %d\n", result); return; } } } int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; ima_lsm_update_rules(); return NOTIFY_OK; } /** * ima_match_rule_data - determine whether func_data matches the policy rule * @rule: a pointer to a rule * @func_data: data to match against the measure rule data * @cred: a pointer to a credentials structure for user validation * * Returns true if func_data matches one in the rule, false otherwise. */ static bool ima_match_rule_data(struct ima_rule_entry *rule, const char *func_data, const struct cred *cred) { const struct ima_rule_opt_list *opt_list = NULL; bool matched = false; size_t i; if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid)) return false; switch (rule->func) { case KEY_CHECK: if (!rule->keyrings) return true; opt_list = rule->keyrings; break; case CRITICAL_DATA: if (!rule->label) return true; opt_list = rule->label; break; default: return false; } if (!func_data) return false; for (i = 0; i < opt_list->count; i++) { if (!strcmp(opt_list->items[i], func_data)) { matched = true; break; } } return matched; } /** * ima_match_rules - determine whether an inode matches the policy rule. * @rule: a pointer to a rule * @idmap: idmap of the mount the inode was found from * @inode: a pointer to an inode * @cred: a pointer to a credentials structure for user validation * @prop: LSM properties of the task to be validated * @func: LIM hook identifier * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) * @func_data: func specific data, may be NULL * * Returns true on rule match, false on failure. 
*/ static bool ima_match_rules(struct ima_rule_entry *rule, struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, enum ima_hooks func, int mask, const char *func_data) { int i; bool result = false; struct ima_rule_entry *lsm_rule = rule; bool rule_reinitialized = false; if ((rule->flags & IMA_FUNC) && (rule->func != func && func != POST_SETATTR)) return false; switch (func) { case KEY_CHECK: case CRITICAL_DATA: return ((rule->func == func) && ima_match_rule_data(rule, func_data, cred)); default: break; } if ((rule->flags & IMA_MASK) && (rule->mask != mask && func != POST_SETATTR)) return false; if ((rule->flags & IMA_INMASK) && (!(rule->mask & mask) && func != POST_SETATTR)) return false; if ((rule->flags & IMA_FSMAGIC) && rule->fsmagic != inode->i_sb->s_magic) return false; if ((rule->flags & IMA_FSNAME) && strcmp(rule->fsname, inode->i_sb->s_type->name)) return false; if ((rule->flags & IMA_FSUUID) && !uuid_equal(&rule->fsuuid, &inode->i_sb->s_uuid)) return false; if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid)) return false; if (rule->flags & IMA_EUID) { if (has_capability_noaudit(current, CAP_SETUID)) { if (!rule->uid_op(cred->euid, rule->uid) && !rule->uid_op(cred->suid, rule->uid) && !rule->uid_op(cred->uid, rule->uid)) return false; } else if (!rule->uid_op(cred->euid, rule->uid)) return false; } if ((rule->flags & IMA_GID) && !rule->gid_op(cred->gid, rule->gid)) return false; if (rule->flags & IMA_EGID) { if (has_capability_noaudit(current, CAP_SETGID)) { if (!rule->gid_op(cred->egid, rule->gid) && !rule->gid_op(cred->sgid, rule->gid) && !rule->gid_op(cred->gid, rule->gid)) return false; } else if (!rule->gid_op(cred->egid, rule->gid)) return false; } if ((rule->flags & IMA_FOWNER) && !rule->fowner_op(i_uid_into_vfsuid(idmap, inode), rule->fowner)) return false; if ((rule->flags & IMA_FGROUP) && !rule->fgroup_op(i_gid_into_vfsgid(idmap, inode), rule->fgroup)) return false; for (i = 0; i < MAX_LSM_RULES; i++) { int rc = 0; struct lsm_prop inode_prop = { }; if (!lsm_rule->lsm[i].rule) { if (!lsm_rule->lsm[i].args_p) continue; else return false; } retry: switch (i) { case LSM_OBJ_USER: case LSM_OBJ_ROLE: case LSM_OBJ_TYPE: security_inode_getlsmprop(inode, &inode_prop); rc = ima_filter_rule_match(&inode_prop, lsm_rule->lsm[i].type, Audit_equal, lsm_rule->lsm[i].rule); break; case LSM_SUBJ_USER: case LSM_SUBJ_ROLE: case LSM_SUBJ_TYPE: rc = ima_filter_rule_match(prop, lsm_rule->lsm[i].type, Audit_equal, lsm_rule->lsm[i].rule); break; default: break; } if (rc == -ESTALE && !rule_reinitialized) { lsm_rule = ima_lsm_copy_rule(rule, GFP_ATOMIC); if (lsm_rule) { rule_reinitialized = true; goto retry; } } if (!rc) { result = false; goto out; } } result = true; out: if (rule_reinitialized) { for (i = 0; i < MAX_LSM_RULES; i++) ima_filter_rule_free(lsm_rule->lsm[i].rule); kfree(lsm_rule); } return result; } /* * In addition to knowing that we need to appraise the file in general, * we need to differentiate between calling hooks, for hook specific rules. */ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func) { if (!(rule->flags & IMA_FUNC)) return IMA_FILE_APPRAISE; switch (func) { case MMAP_CHECK: case MMAP_CHECK_REQPROT: return IMA_MMAP_APPRAISE; case BPRM_CHECK: return IMA_BPRM_APPRAISE; case CREDS_CHECK: return IMA_CREDS_APPRAISE; case FILE_CHECK: case POST_SETATTR: return IMA_FILE_APPRAISE; case MODULE_CHECK ... 
MAX_CHECK - 1: default: return IMA_READ_APPRAISE; } } /** * ima_match_policy - decision based on LSM and other conditions * @idmap: idmap of the mount the inode was found from * @inode: pointer to an inode for which the policy decision is being made * @cred: pointer to a credentials structure for which the policy decision is * being made * @prop: LSM properties of the task to be validated * @func: IMA hook identifier * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) * @flags: IMA actions to consider (e.g. IMA_MEASURE | IMA_APPRAISE) * @pcr: set the pcr to extend * @template_desc: the template that should be used for this rule * @func_data: func specific data, may be NULL * @allowed_algos: allowlist of hash algorithms for the IMA xattr * * Measure decision based on func/mask/fsmagic and LSM(subj/obj/type) * conditions. * * Since the IMA policy may be updated multiple times we need to lock the * list when walking it. Reads are many orders of magnitude more numerous * than writes so ima_match_policy() is classical RCU candidate. */ int ima_match_policy(struct mnt_idmap *idmap, struct inode *inode, const struct cred *cred, struct lsm_prop *prop, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos) { struct ima_rule_entry *entry; int action = 0, actmask = flags | (flags << 1); struct list_head *ima_rules_tmp; if (template_desc && !*template_desc) *template_desc = ima_template_desc_current(); rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { if (!(entry->action & actmask)) continue; if (!ima_match_rules(entry, idmap, inode, cred, prop, func, mask, func_data)) continue; action |= entry->flags & IMA_NONACTION_FLAGS; action |= entry->action & IMA_DO_MASK; if (entry->action & IMA_APPRAISE) { action |= get_subaction(entry, func); action &= ~IMA_HASH; if (ima_fail_unverifiable_sigs) action |= IMA_FAIL_UNVERIFIABLE_SIGS; if (allowed_algos && entry->flags & IMA_VALIDATE_ALGOS) *allowed_algos = entry->allowed_algos; } if (entry->action & IMA_DO_MASK) actmask &= ~(entry->action | entry->action << 1); else actmask &= ~(entry->action | entry->action >> 1); if ((pcr) && (entry->flags & IMA_PCR)) *pcr = entry->pcr; if (template_desc && entry->template) *template_desc = entry->template; if (!actmask) break; } rcu_read_unlock(); return action; } /** * ima_update_policy_flags() - Update global IMA variables * * Update ima_policy_flag and ima_setxattr_allowed_hash_algorithms * based on the currently loaded policy. * * With ima_policy_flag, the decision to short circuit out of a function * or not call the function in the first place can be made earlier. * * With ima_setxattr_allowed_hash_algorithms, the policy can restrict the * set of hash algorithms accepted when updating the security.ima xattr of * a file. * * Context: called after a policy update and at system initialization. */ void ima_update_policy_flags(void) { struct ima_rule_entry *entry; int new_policy_flag = 0; struct list_head *ima_rules_tmp; rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { /* * SETXATTR_CHECK rules do not implement a full policy check * because rule checking would probably have an important * performance impact on setxattr(). As a consequence, only one * SETXATTR_CHECK can be active at a given time. * Because we want to preserve that property, we set out to use * atomic_cmpxchg. 
Either: * - the atomic was non-zero: a setxattr hash policy is * already enforced, we do nothing * - the atomic was zero: no setxattr policy was set, enable * the setxattr hash policy */ if (entry->func == SETXATTR_CHECK) { atomic_cmpxchg(&ima_setxattr_allowed_hash_algorithms, 0, entry->allowed_algos); /* SETXATTR_CHECK doesn't impact ima_policy_flag */ continue; } if (entry->action & IMA_DO_MASK) new_policy_flag |= entry->action; } rcu_read_unlock(); ima_appraise |= (build_ima_appraise | temp_ima_appraise); if (!ima_appraise) new_policy_flag &= ~IMA_APPRAISE; ima_policy_flag = new_policy_flag; } static int ima_appraise_flag(enum ima_hooks func) { if (func == MODULE_CHECK) return IMA_APPRAISE_MODULES; else if (func == FIRMWARE_CHECK) return IMA_APPRAISE_FIRMWARE; else if (func == POLICY_CHECK) return IMA_APPRAISE_POLICY; else if (func == KEXEC_KERNEL_CHECK) return IMA_APPRAISE_KEXEC; return 0; } static void add_rules(struct ima_rule_entry *entries, int count, enum policy_rule_list policy_rule) { int i = 0; for (i = 0; i < count; i++) { struct ima_rule_entry *entry; if (policy_rule & IMA_DEFAULT_POLICY) list_add_tail(&entries[i].list, &ima_default_rules); if (policy_rule & IMA_CUSTOM_POLICY) { entry = kmemdup(&entries[i], sizeof(*entry), GFP_KERNEL); if (!entry) continue; list_add_tail(&entry->list, &ima_policy_rules); } if (entries[i].action == APPRAISE) { if (entries != build_appraise_rules) temp_ima_appraise |= ima_appraise_flag(entries[i].func); else build_ima_appraise |= ima_appraise_flag(entries[i].func); } } } static int ima_parse_rule(char *rule, struct ima_rule_entry *entry); static int __init ima_init_arch_policy(void) { const char * const *arch_rules; const char * const *rules; int arch_entries = 0; int i = 0; arch_rules = arch_get_ima_policy(); if (!arch_rules) return arch_entries; /* Get number of rules */ for (rules = arch_rules; *rules != NULL; rules++) arch_entries++; arch_policy_entry = kcalloc(arch_entries + 1, sizeof(*arch_policy_entry), GFP_KERNEL); if (!arch_policy_entry) return 0; /* Convert each policy string rules to struct ima_rule_entry format */ for (rules = arch_rules, i = 0; *rules != NULL; rules++) { char rule[255]; int result; result = strscpy(rule, *rules, sizeof(rule)); INIT_LIST_HEAD(&arch_policy_entry[i].list); result = ima_parse_rule(rule, &arch_policy_entry[i]); if (result) { pr_warn("Skipping unknown architecture policy rule: %s\n", rule); memset(&arch_policy_entry[i], 0, sizeof(*arch_policy_entry)); continue; } i++; } return i; } /** * ima_init_policy - initialize the default measure rules. * * ima_rules points to either the ima_default_rules or the new ima_policy_rules. */ void __init ima_init_policy(void) { int build_appraise_entries, arch_entries; /* if !ima_policy, we load NO default rules */ if (ima_policy) add_rules(dont_measure_rules, ARRAY_SIZE(dont_measure_rules), IMA_DEFAULT_POLICY); switch (ima_policy) { case ORIGINAL_TCB: add_rules(original_measurement_rules, ARRAY_SIZE(original_measurement_rules), IMA_DEFAULT_POLICY); break; case DEFAULT_TCB: add_rules(default_measurement_rules, ARRAY_SIZE(default_measurement_rules), IMA_DEFAULT_POLICY); break; default: break; } /* * Based on runtime secure boot flags, insert arch specific measurement * and appraise rules requiring file signatures for both the initial * and custom policies, prior to other appraise rules. 
* (Highest priority) */ arch_entries = ima_init_arch_policy(); if (!arch_entries) pr_info("No architecture policies found\n"); else add_rules(arch_policy_entry, arch_entries, IMA_DEFAULT_POLICY | IMA_CUSTOM_POLICY); /* * Insert the builtin "secure_boot" policy rules requiring file * signatures, prior to other appraise rules. */ if (ima_use_secure_boot) add_rules(secure_boot_rules, ARRAY_SIZE(secure_boot_rules), IMA_DEFAULT_POLICY); /* * Insert the build time appraise rules requiring file signatures * for both the initial and custom policies, prior to other appraise * rules. As the secure boot rules includes all of the build time * rules, include either one or the other set of rules, but not both. */ build_appraise_entries = ARRAY_SIZE(build_appraise_rules); if (build_appraise_entries) { if (ima_use_secure_boot) add_rules(build_appraise_rules, build_appraise_entries, IMA_CUSTOM_POLICY); else add_rules(build_appraise_rules, build_appraise_entries, IMA_DEFAULT_POLICY | IMA_CUSTOM_POLICY); } if (ima_use_appraise_tcb) add_rules(default_appraise_rules, ARRAY_SIZE(default_appraise_rules), IMA_DEFAULT_POLICY); if (ima_use_critical_data) add_rules(critical_data_rules, ARRAY_SIZE(critical_data_rules), IMA_DEFAULT_POLICY); atomic_set(&ima_setxattr_allowed_hash_algorithms, 0); ima_update_policy_flags(); } /* Make sure we have a valid policy, at least containing some rules. */ int ima_check_policy(void) { if (list_empty(&ima_temp_rules)) return -EINVAL; return 0; } /** * ima_update_policy - update default_rules with new measure rules * * Called on file .release to update the default rules with a complete new * policy. What we do here is to splice ima_policy_rules and ima_temp_rules so * they make a queue. The policy may be updated multiple times and this is the * RCU updater. * * Policy rules are never deleted so ima_policy_flag gets zeroed only once when * we switch from the default policy to user defined. */ void ima_update_policy(void) { struct list_head *policy = &ima_policy_rules; list_splice_tail_init_rcu(&ima_temp_rules, policy, synchronize_rcu); if (ima_rules != (struct list_head __rcu *)policy) { ima_policy_flag = 0; rcu_assign_pointer(ima_rules, policy); /* * IMA architecture specific policy rules are specified * as strings and converted to an array of ima_entry_rules * on boot. After loading a custom policy, free the * architecture specific rules stored as an array. */ kfree(arch_policy_entry); } ima_update_policy_flags(); /* Custom IMA policy has been loaded */ ima_process_queued_keys(); } /* Keep the enumeration in sync with the policy_tokens! 
*/ enum policy_opt { Opt_measure, Opt_dont_measure, Opt_appraise, Opt_dont_appraise, Opt_audit, Opt_hash, Opt_dont_hash, Opt_obj_user, Opt_obj_role, Opt_obj_type, Opt_subj_user, Opt_subj_role, Opt_subj_type, Opt_func, Opt_mask, Opt_fsmagic, Opt_fsname, Opt_fsuuid, Opt_uid_eq, Opt_euid_eq, Opt_gid_eq, Opt_egid_eq, Opt_fowner_eq, Opt_fgroup_eq, Opt_uid_gt, Opt_euid_gt, Opt_gid_gt, Opt_egid_gt, Opt_fowner_gt, Opt_fgroup_gt, Opt_uid_lt, Opt_euid_lt, Opt_gid_lt, Opt_egid_lt, Opt_fowner_lt, Opt_fgroup_lt, Opt_digest_type, Opt_appraise_type, Opt_appraise_flag, Opt_appraise_algos, Opt_permit_directio, Opt_pcr, Opt_template, Opt_keyrings, Opt_label, Opt_err }; static const match_table_t policy_tokens = { {Opt_measure, "measure"}, {Opt_dont_measure, "dont_measure"}, {Opt_appraise, "appraise"}, {Opt_dont_appraise, "dont_appraise"}, {Opt_audit, "audit"}, {Opt_hash, "hash"}, {Opt_dont_hash, "dont_hash"}, {Opt_obj_user, "obj_user=%s"}, {Opt_obj_role, "obj_role=%s"}, {Opt_obj_type, "obj_type=%s"}, {Opt_subj_user, "subj_user=%s"}, {Opt_subj_role, "subj_role=%s"}, {Opt_subj_type, "subj_type=%s"}, {Opt_func, "func=%s"}, {Opt_mask, "mask=%s"}, {Opt_fsmagic, "fsmagic=%s"}, {Opt_fsname, "fsname=%s"}, {Opt_fsuuid, "fsuuid=%s"}, {Opt_uid_eq, "uid=%s"}, {Opt_euid_eq, "euid=%s"}, {Opt_gid_eq, "gid=%s"}, {Opt_egid_eq, "egid=%s"}, {Opt_fowner_eq, "fowner=%s"}, {Opt_fgroup_eq, "fgroup=%s"}, {Opt_uid_gt, "uid>%s"}, {Opt_euid_gt, "euid>%s"}, {Opt_gid_gt, "gid>%s"}, {Opt_egid_gt, "egid>%s"}, {Opt_fowner_gt, "fowner>%s"}, {Opt_fgroup_gt, "fgroup>%s"}, {Opt_uid_lt, "uid<%s"}, {Opt_euid_lt, "euid<%s"}, {Opt_gid_lt, "gid<%s"}, {Opt_egid_lt, "egid<%s"}, {Opt_fowner_lt, "fowner<%s"}, {Opt_fgroup_lt, "fgroup<%s"}, {Opt_digest_type, "digest_type=%s"}, {Opt_appraise_type, "appraise_type=%s"}, {Opt_appraise_flag, "appraise_flag=%s"}, {Opt_appraise_algos, "appraise_algos=%s"}, {Opt_permit_directio, "permit_directio"}, {Opt_pcr, "pcr=%s"}, {Opt_template, "template=%s"}, {Opt_keyrings, "keyrings=%s"}, {Opt_label, "label=%s"}, {Opt_err, NULL} }; static int ima_lsm_rule_init(struct ima_rule_entry *entry, substring_t *args, int lsm_rule, int audit_type) { int result; if (entry->lsm[lsm_rule].rule) return -EINVAL; entry->lsm[lsm_rule].args_p = match_strdup(args); if (!entry->lsm[lsm_rule].args_p) return -ENOMEM; entry->lsm[lsm_rule].type = audit_type; result = ima_filter_rule_init(entry->lsm[lsm_rule].type, Audit_equal, entry->lsm[lsm_rule].args_p, &entry->lsm[lsm_rule].rule, GFP_KERNEL); if (!entry->lsm[lsm_rule].rule) { pr_warn("rule for LSM \'%s\' is undefined\n", entry->lsm[lsm_rule].args_p); if (ima_rules == (struct list_head __rcu *)(&ima_default_rules)) { kfree(entry->lsm[lsm_rule].args_p); entry->lsm[lsm_rule].args_p = NULL; result = -EINVAL; } else result = 0; } return result; } static void ima_log_string_op(struct audit_buffer *ab, char *key, char *value, enum policy_opt rule_operator) { if (!ab) return; switch (rule_operator) { case Opt_uid_gt: case Opt_euid_gt: case Opt_gid_gt: case Opt_egid_gt: case Opt_fowner_gt: case Opt_fgroup_gt: audit_log_format(ab, "%s>", key); break; case Opt_uid_lt: case Opt_euid_lt: case Opt_gid_lt: case Opt_egid_lt: case Opt_fowner_lt: case Opt_fgroup_lt: audit_log_format(ab, "%s<", key); break; default: audit_log_format(ab, "%s=", key); } audit_log_format(ab, "%s ", value); } static void ima_log_string(struct audit_buffer *ab, char *key, char *value) { ima_log_string_op(ab, key, value, Opt_err); } /* * Validating the appended signature included in the measurement list requires * the file hash 
calculated without the appended signature (i.e., the 'd-modsig' * field). Therefore, notify the user if they have the 'modsig' field but not * the 'd-modsig' field in the template. */ static void check_template_modsig(const struct ima_template_desc *template) { #define MSG "template with 'modsig' field also needs 'd-modsig' field\n" bool has_modsig, has_dmodsig; static bool checked; int i; /* We only need to notify the user once. */ if (checked) return; has_modsig = has_dmodsig = false; for (i = 0; i < template->num_fields; i++) { if (!strcmp(template->fields[i]->field_id, "modsig")) has_modsig = true; else if (!strcmp(template->fields[i]->field_id, "d-modsig")) has_dmodsig = true; } if (has_modsig && !has_dmodsig) pr_notice(MSG); checked = true; #undef MSG } /* * Warn if the template does not contain the given field. */ static void check_template_field(const struct ima_template_desc *template, const char *field, const char *msg) { int i; for (i = 0; i < template->num_fields; i++) if (!strcmp(template->fields[i]->field_id, field)) return; pr_notice_once("%s", msg); } static bool ima_validate_rule(struct ima_rule_entry *entry) { /* Ensure that the action is set and is compatible with the flags */ if (entry->action == UNKNOWN) return false; if (entry->action != MEASURE && entry->flags & IMA_PCR) return false; if (entry->action != APPRAISE && entry->flags & (IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS)) return false; /* * The IMA_FUNC bit must be set if and only if there's a valid hook * function specified, and vice versa. Enforcing this property allows * for the NONE case below to validate a rule without an explicit hook * function. */ if (((entry->flags & IMA_FUNC) && entry->func == NONE) || (!(entry->flags & IMA_FUNC) && entry->func != NONE)) return false; /* * Ensure that the hook function is compatible with the other * components of the rule */ switch (entry->func) { case NONE: case FILE_CHECK: case MMAP_CHECK: case MMAP_CHECK_REQPROT: case BPRM_CHECK: case CREDS_CHECK: case POST_SETATTR: case FIRMWARE_CHECK: case POLICY_CHECK: if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC | IMA_UID | IMA_FOWNER | IMA_FSUUID | IMA_INMASK | IMA_EUID | IMA_PCR | IMA_FSNAME | IMA_GID | IMA_EGID | IMA_FGROUP | IMA_DIGSIG_REQUIRED | IMA_PERMIT_DIRECTIO | IMA_VALIDATE_ALGOS | IMA_CHECK_BLACKLIST | IMA_VERITY_REQUIRED)) return false; break; case MODULE_CHECK: case KEXEC_KERNEL_CHECK: case KEXEC_INITRAMFS_CHECK: if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC | IMA_UID | IMA_FOWNER | IMA_FSUUID | IMA_INMASK | IMA_EUID | IMA_PCR | IMA_FSNAME | IMA_GID | IMA_EGID | IMA_FGROUP | IMA_DIGSIG_REQUIRED | IMA_PERMIT_DIRECTIO | IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS)) return false; break; case KEXEC_CMDLINE: if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; if (entry->flags & ~(IMA_FUNC | IMA_FSMAGIC | IMA_UID | IMA_FOWNER | IMA_FSUUID | IMA_EUID | IMA_PCR | IMA_FSNAME | IMA_GID | IMA_EGID | IMA_FGROUP)) return false; break; case KEY_CHECK: if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_GID | IMA_PCR | IMA_KEYRINGS)) return false; if (ima_rule_contains_lsm_cond(entry)) return false; break; case CRITICAL_DATA: if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_GID | IMA_PCR | IMA_LABEL)) return false; if (ima_rule_contains_lsm_cond(entry)) return false; break; case SETXATTR_CHECK: /* any action other than APPRAISE is 
unsupported */ if (entry->action != APPRAISE) return false; /* SETXATTR_CHECK requires an appraise_algos parameter */ if (!(entry->flags & IMA_VALIDATE_ALGOS)) return false; /* * full policies are not supported, they would have too * much of a performance impact */ if (entry->flags & ~(IMA_FUNC | IMA_VALIDATE_ALGOS)) return false; break; default: return false; } /* Ensure that combinations of flags are compatible with each other */ if (entry->flags & IMA_CHECK_BLACKLIST && !(entry->flags & IMA_DIGSIG_REQUIRED)) return false; /* * Unlike for regular IMA 'appraise' policy rules where security.ima * xattr may contain either a file hash or signature, the security.ima * xattr for fsverity must contain a file signature (sigv3). Ensure * that 'appraise' rules for fsverity require file signatures by * checking the IMA_DIGSIG_REQUIRED flag is set. */ if (entry->action == APPRAISE && (entry->flags & IMA_VERITY_REQUIRED) && !(entry->flags & IMA_DIGSIG_REQUIRED)) return false; return true; } static unsigned int ima_parse_appraise_algos(char *arg) { unsigned int res = 0; int idx; char *token; while ((token = strsep(&arg, ",")) != NULL) { idx = match_string(hash_algo_name, HASH_ALGO__LAST, token); if (idx < 0) { pr_err("unknown hash algorithm \"%s\"", token); return 0; } if (!crypto_has_alg(hash_algo_name[idx], 0, 0)) { pr_err("unavailable hash algorithm \"%s\", check your kernel configuration", token); return 0; } /* Add the hash algorithm to the 'allowed' bitfield */ res |= (1U << idx); } return res; } static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) { struct audit_buffer *ab; char *from; char *p; bool eid_token; /* either euid or egid */ struct ima_template_desc *template_desc; int result = 0; ab = integrity_audit_log_start(audit_context(), GFP_KERNEL, AUDIT_INTEGRITY_POLICY_RULE); entry->uid = INVALID_UID; entry->gid = INVALID_GID; entry->fowner = INVALID_UID; entry->fgroup = INVALID_GID; entry->uid_op = &uid_eq; entry->gid_op = &gid_eq; entry->fowner_op = &vfsuid_eq_kuid; entry->fgroup_op = &vfsgid_eq_kgid; entry->action = UNKNOWN; while ((p = strsep(&rule, " \t")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; unsigned long lnum; if (result < 0 || *p == '#') /* ignore suffixed comment */ break; if ((*p == '\0') || (*p == ' ') || (*p == '\t')) continue; token = match_token(p, policy_tokens, args); switch (token) { case Opt_measure: ima_log_string(ab, "action", "measure"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = MEASURE; break; case Opt_dont_measure: ima_log_string(ab, "action", "dont_measure"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_MEASURE; break; case Opt_appraise: ima_log_string(ab, "action", "appraise"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = APPRAISE; break; case Opt_dont_appraise: ima_log_string(ab, "action", "dont_appraise"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_APPRAISE; break; case Opt_audit: ima_log_string(ab, "action", "audit"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = AUDIT; break; case Opt_hash: ima_log_string(ab, "action", "hash"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = HASH; break; case Opt_dont_hash: ima_log_string(ab, "action", "dont_hash"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_HASH; break; case Opt_func: ima_log_string(ab, "func", args[0].from); if (entry->func) result = -EINVAL; if (strcmp(args[0].from, "FILE_CHECK") == 0) entry->func = FILE_CHECK; /* PATH_CHECK is 
for backwards compat */ else if (strcmp(args[0].from, "PATH_CHECK") == 0) entry->func = FILE_CHECK; else if (strcmp(args[0].from, "MODULE_CHECK") == 0) entry->func = MODULE_CHECK; else if (strcmp(args[0].from, "FIRMWARE_CHECK") == 0) entry->func = FIRMWARE_CHECK; else if ((strcmp(args[0].from, "FILE_MMAP") == 0) || (strcmp(args[0].from, "MMAP_CHECK") == 0)) entry->func = MMAP_CHECK; else if ((strcmp(args[0].from, "MMAP_CHECK_REQPROT") == 0)) entry->func = MMAP_CHECK_REQPROT; else if (strcmp(args[0].from, "BPRM_CHECK") == 0) entry->func = BPRM_CHECK; else if (strcmp(args[0].from, "CREDS_CHECK") == 0) entry->func = CREDS_CHECK; else if (strcmp(args[0].from, "KEXEC_KERNEL_CHECK") == 0) entry->func = KEXEC_KERNEL_CHECK; else if (strcmp(args[0].from, "KEXEC_INITRAMFS_CHECK") == 0) entry->func = KEXEC_INITRAMFS_CHECK; else if (strcmp(args[0].from, "POLICY_CHECK") == 0) entry->func = POLICY_CHECK; else if (strcmp(args[0].from, "KEXEC_CMDLINE") == 0) entry->func = KEXEC_CMDLINE; else if (IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) && strcmp(args[0].from, "KEY_CHECK") == 0) entry->func = KEY_CHECK; else if (strcmp(args[0].from, "CRITICAL_DATA") == 0) entry->func = CRITICAL_DATA; else if (strcmp(args[0].from, "SETXATTR_CHECK") == 0) entry->func = SETXATTR_CHECK; else result = -EINVAL; if (!result) entry->flags |= IMA_FUNC; break; case Opt_mask: ima_log_string(ab, "mask", args[0].from); if (entry->mask) result = -EINVAL; from = args[0].from; if (*from == '^') from++; if ((strcmp(from, "MAY_EXEC")) == 0) entry->mask = MAY_EXEC; else if (strcmp(from, "MAY_WRITE") == 0) entry->mask = MAY_WRITE; else if (strcmp(from, "MAY_READ") == 0) entry->mask = MAY_READ; else if (strcmp(from, "MAY_APPEND") == 0) entry->mask = MAY_APPEND; else result = -EINVAL; if (!result) entry->flags |= (*args[0].from == '^') ? IMA_INMASK : IMA_MASK; break; case Opt_fsmagic: ima_log_string(ab, "fsmagic", args[0].from); if (entry->fsmagic) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 16, &entry->fsmagic); if (!result) entry->flags |= IMA_FSMAGIC; break; case Opt_fsname: ima_log_string(ab, "fsname", args[0].from); entry->fsname = kstrdup(args[0].from, GFP_KERNEL); if (!entry->fsname) { result = -ENOMEM; break; } result = 0; entry->flags |= IMA_FSNAME; break; case Opt_keyrings: ima_log_string(ab, "keyrings", args[0].from); if (!IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) || entry->keyrings) { result = -EINVAL; break; } entry->keyrings = ima_alloc_rule_opt_list(args); if (IS_ERR(entry->keyrings)) { result = PTR_ERR(entry->keyrings); entry->keyrings = NULL; break; } entry->flags |= IMA_KEYRINGS; break; case Opt_label: ima_log_string(ab, "label", args[0].from); if (entry->label) { result = -EINVAL; break; } entry->label = ima_alloc_rule_opt_list(args); if (IS_ERR(entry->label)) { result = PTR_ERR(entry->label); entry->label = NULL; break; } entry->flags |= IMA_LABEL; break; case Opt_fsuuid: ima_log_string(ab, "fsuuid", args[0].from); if (!uuid_is_null(&entry->fsuuid)) { result = -EINVAL; break; } result = uuid_parse(args[0].from, &entry->fsuuid); if (!result) entry->flags |= IMA_FSUUID; break; case Opt_uid_gt: case Opt_euid_gt: entry->uid_op = &uid_gt; fallthrough; case Opt_uid_lt: case Opt_euid_lt: if ((token == Opt_uid_lt) || (token == Opt_euid_lt)) entry->uid_op = &uid_lt; fallthrough; case Opt_uid_eq: case Opt_euid_eq: eid_token = (token == Opt_euid_eq) || (token == Opt_euid_gt) || (token == Opt_euid_lt); ima_log_string_op(ab, eid_token ? 
"euid" : "uid", args[0].from, token); if (uid_valid(entry->uid)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->uid = make_kuid(current_user_ns(), (uid_t) lnum); if (!uid_valid(entry->uid) || (uid_t)lnum != lnum) result = -EINVAL; else entry->flags |= eid_token ? IMA_EUID : IMA_UID; } break; case Opt_gid_gt: case Opt_egid_gt: entry->gid_op = &gid_gt; fallthrough; case Opt_gid_lt: case Opt_egid_lt: if ((token == Opt_gid_lt) || (token == Opt_egid_lt)) entry->gid_op = &gid_lt; fallthrough; case Opt_gid_eq: case Opt_egid_eq: eid_token = (token == Opt_egid_eq) || (token == Opt_egid_gt) || (token == Opt_egid_lt); ima_log_string_op(ab, eid_token ? "egid" : "gid", args[0].from, token); if (gid_valid(entry->gid)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->gid = make_kgid(current_user_ns(), (gid_t)lnum); if (!gid_valid(entry->gid) || (((gid_t)lnum) != lnum)) result = -EINVAL; else entry->flags |= eid_token ? IMA_EGID : IMA_GID; } break; case Opt_fowner_gt: entry->fowner_op = &vfsuid_gt_kuid; fallthrough; case Opt_fowner_lt: if (token == Opt_fowner_lt) entry->fowner_op = &vfsuid_lt_kuid; fallthrough; case Opt_fowner_eq: ima_log_string_op(ab, "fowner", args[0].from, token); if (uid_valid(entry->fowner)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->fowner = make_kuid(current_user_ns(), (uid_t)lnum); if (!uid_valid(entry->fowner) || (((uid_t)lnum) != lnum)) result = -EINVAL; else entry->flags |= IMA_FOWNER; } break; case Opt_fgroup_gt: entry->fgroup_op = &vfsgid_gt_kgid; fallthrough; case Opt_fgroup_lt: if (token == Opt_fgroup_lt) entry->fgroup_op = &vfsgid_lt_kgid; fallthrough; case Opt_fgroup_eq: ima_log_string_op(ab, "fgroup", args[0].from, token); if (gid_valid(entry->fgroup)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->fgroup = make_kgid(current_user_ns(), (gid_t)lnum); if (!gid_valid(entry->fgroup) || (((gid_t)lnum) != lnum)) result = -EINVAL; else entry->flags |= IMA_FGROUP; } break; case Opt_obj_user: ima_log_string(ab, "obj_user", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_OBJ_USER, AUDIT_OBJ_USER); break; case Opt_obj_role: ima_log_string(ab, "obj_role", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_OBJ_ROLE, AUDIT_OBJ_ROLE); break; case Opt_obj_type: ima_log_string(ab, "obj_type", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_OBJ_TYPE, AUDIT_OBJ_TYPE); break; case Opt_subj_user: ima_log_string(ab, "subj_user", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_SUBJ_USER, AUDIT_SUBJ_USER); break; case Opt_subj_role: ima_log_string(ab, "subj_role", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_SUBJ_ROLE, AUDIT_SUBJ_ROLE); break; case Opt_subj_type: ima_log_string(ab, "subj_type", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_SUBJ_TYPE, AUDIT_SUBJ_TYPE); break; case Opt_digest_type: ima_log_string(ab, "digest_type", args[0].from); if (entry->flags & IMA_DIGSIG_REQUIRED) result = -EINVAL; else if ((strcmp(args[0].from, "verity")) == 0) entry->flags |= IMA_VERITY_REQUIRED; else result = -EINVAL; break; case Opt_appraise_type: ima_log_string(ab, "appraise_type", args[0].from); if ((strcmp(args[0].from, "imasig")) == 0) { if (entry->flags & IMA_VERITY_REQUIRED) result = -EINVAL; else entry->flags |= IMA_DIGSIG_REQUIRED | IMA_CHECK_BLACKLIST; } else if (strcmp(args[0].from, "sigv3") == 0) { /* Only fsverity supports sigv3 for 
now */ if (entry->flags & IMA_VERITY_REQUIRED) entry->flags |= IMA_DIGSIG_REQUIRED | IMA_CHECK_BLACKLIST; else result = -EINVAL; } else if (IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG) && strcmp(args[0].from, "imasig|modsig") == 0) { if (entry->flags & IMA_VERITY_REQUIRED) result = -EINVAL; else entry->flags |= IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST; } else { result = -EINVAL; } break; case Opt_appraise_flag: ima_log_string(ab, "appraise_flag", args[0].from); break; case Opt_appraise_algos: ima_log_string(ab, "appraise_algos", args[0].from); if (entry->allowed_algos) { result = -EINVAL; break; } entry->allowed_algos = ima_parse_appraise_algos(args[0].from); /* invalid or empty list of algorithms */ if (!entry->allowed_algos) { result = -EINVAL; break; } entry->flags |= IMA_VALIDATE_ALGOS; break; case Opt_permit_directio: entry->flags |= IMA_PERMIT_DIRECTIO; break; case Opt_pcr: ima_log_string(ab, "pcr", args[0].from); result = kstrtoint(args[0].from, 10, &entry->pcr); if (result || INVALID_PCR(entry->pcr)) result = -EINVAL; else entry->flags |= IMA_PCR; break; case Opt_template: ima_log_string(ab, "template", args[0].from); if (entry->action != MEASURE) { result = -EINVAL; break; } template_desc = lookup_template_desc(args[0].from); if (!template_desc || entry->template) { result = -EINVAL; break; } /* * template_desc_init_fields() does nothing if * the template is already initialised, so * it's safe to do this unconditionally */ template_desc_init_fields(template_desc->fmt, &(template_desc->fields), &(template_desc->num_fields)); entry->template = template_desc; break; case Opt_err: ima_log_string(ab, "UNKNOWN", p); result = -EINVAL; break; } } if (!result && !ima_validate_rule(entry)) result = -EINVAL; else if (entry->action == APPRAISE) temp_ima_appraise |= ima_appraise_flag(entry->func); if (!result && entry->flags & IMA_MODSIG_ALLOWED) { template_desc = entry->template ? entry->template : ima_template_desc_current(); check_template_modsig(template_desc); } /* d-ngv2 template field recommended for unsigned fs-verity digests */ if (!result && entry->action == MEASURE && entry->flags & IMA_VERITY_REQUIRED) { template_desc = entry->template ? entry->template : ima_template_desc_current(); check_template_field(template_desc, "d-ngv2", "verity rules should include d-ngv2"); } audit_log_format(ab, "res=%d", !result); audit_log_end(ab); return result; } /** * ima_parse_add_rule - add a rule to ima_policy_rules * @rule: ima measurement policy rule * * Avoid locking by allowing just one writer at a time in ima_write_policy() * Returns the length of the rule parsed, an error code on failure */ ssize_t ima_parse_add_rule(char *rule) { static const char op[] = "update_policy"; char *p; struct ima_rule_entry *entry; ssize_t result, len; int audit_info = 0; p = strsep(&rule, "\n"); len = strlen(p) + 1; p += strspn(p, " \t"); if (*p == '#' || *p == '\0') return len; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, op, "-ENOMEM", -ENOMEM, audit_info); return -ENOMEM; } INIT_LIST_HEAD(&entry->list); result = ima_parse_rule(p, entry); if (result) { ima_free_rule(entry); integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, op, "invalid-policy", result, audit_info); return result; } list_add_tail(&entry->list, &ima_temp_rules); return len; } /** * ima_delete_rules() - called to cleanup invalid in-flight policy. * * We don't need locking as we operate on the temp list, which is * different from the active one. 
There is also only one user of * ima_delete_rules() at a time. */ void ima_delete_rules(void) { struct ima_rule_entry *entry, *tmp; temp_ima_appraise = 0; list_for_each_entry_safe(entry, tmp, &ima_temp_rules, list) { list_del(&entry->list); ima_free_rule(entry); } } #define __ima_hook_stringify(func, str) (#func), const char *const func_tokens[] = { __ima_hooks(__ima_hook_stringify) }; #ifdef CONFIG_IMA_READ_POLICY enum { mask_exec = 0, mask_write, mask_read, mask_append }; static const char *const mask_tokens[] = { "^MAY_EXEC", "^MAY_WRITE", "^MAY_READ", "^MAY_APPEND" }; void *ima_policy_start(struct seq_file *m, loff_t *pos) { loff_t l = *pos; struct ima_rule_entry *entry; struct list_head *ima_rules_tmp; rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { if (!l--) { rcu_read_unlock(); return entry; } } rcu_read_unlock(); return NULL; } void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos) { struct ima_rule_entry *entry = v; rcu_read_lock(); entry = list_entry_rcu(entry->list.next, struct ima_rule_entry, list); rcu_read_unlock(); (*pos)++; return (&entry->list == &ima_default_rules || &entry->list == &ima_policy_rules) ? NULL : entry; } void ima_policy_stop(struct seq_file *m, void *v) { } #define pt(token) policy_tokens[token].pattern #define mt(token) mask_tokens[token] /* * policy_func_show - display the ima_hooks policy rule */ static void policy_func_show(struct seq_file *m, enum ima_hooks func) { if (func > 0 && func < MAX_CHECK) seq_printf(m, "func=%s ", func_tokens[func]); else seq_printf(m, "func=%d ", func); } static void ima_show_rule_opt_list(struct seq_file *m, const struct ima_rule_opt_list *opt_list) { size_t i; for (i = 0; i < opt_list->count; i++) seq_printf(m, "%s%s", i ? 
"|" : "", opt_list->items[i]); } static void ima_policy_show_appraise_algos(struct seq_file *m, unsigned int allowed_hashes) { int idx, list_size = 0; for (idx = 0; idx < HASH_ALGO__LAST; idx++) { if (!(allowed_hashes & (1U << idx))) continue; /* only add commas if the list contains multiple entries */ if (list_size++) seq_puts(m, ","); seq_puts(m, hash_algo_name[idx]); } } int ima_policy_show(struct seq_file *m, void *v) { struct ima_rule_entry *entry = v; int i; char tbuf[64] = {0,}; int offset = 0; rcu_read_lock(); /* Do not print rules with inactive LSM labels */ for (i = 0; i < MAX_LSM_RULES; i++) { if (entry->lsm[i].args_p && !entry->lsm[i].rule) { rcu_read_unlock(); return 0; } } if (entry->action & MEASURE) seq_puts(m, pt(Opt_measure)); if (entry->action & DONT_MEASURE) seq_puts(m, pt(Opt_dont_measure)); if (entry->action & APPRAISE) seq_puts(m, pt(Opt_appraise)); if (entry->action & DONT_APPRAISE) seq_puts(m, pt(Opt_dont_appraise)); if (entry->action & AUDIT) seq_puts(m, pt(Opt_audit)); if (entry->action & HASH) seq_puts(m, pt(Opt_hash)); if (entry->action & DONT_HASH) seq_puts(m, pt(Opt_dont_hash)); seq_puts(m, " "); if (entry->flags & IMA_FUNC) policy_func_show(m, entry->func); if ((entry->flags & IMA_MASK) || (entry->flags & IMA_INMASK)) { if (entry->flags & IMA_MASK) offset = 1; if (entry->mask & MAY_EXEC) seq_printf(m, pt(Opt_mask), mt(mask_exec) + offset); if (entry->mask & MAY_WRITE) seq_printf(m, pt(Opt_mask), mt(mask_write) + offset); if (entry->mask & MAY_READ) seq_printf(m, pt(Opt_mask), mt(mask_read) + offset); if (entry->mask & MAY_APPEND) seq_printf(m, pt(Opt_mask), mt(mask_append) + offset); seq_puts(m, " "); } if (entry->flags & IMA_FSMAGIC) { snprintf(tbuf, sizeof(tbuf), "0x%lx", entry->fsmagic); seq_printf(m, pt(Opt_fsmagic), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FSNAME) { snprintf(tbuf, sizeof(tbuf), "%s", entry->fsname); seq_printf(m, pt(Opt_fsname), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_KEYRINGS) { seq_puts(m, "keyrings="); ima_show_rule_opt_list(m, entry->keyrings); seq_puts(m, " "); } if (entry->flags & IMA_LABEL) { seq_puts(m, "label="); ima_show_rule_opt_list(m, entry->label); seq_puts(m, " "); } if (entry->flags & IMA_PCR) { snprintf(tbuf, sizeof(tbuf), "%d", entry->pcr); seq_printf(m, pt(Opt_pcr), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FSUUID) { seq_printf(m, "fsuuid=%pU", &entry->fsuuid); seq_puts(m, " "); } if (entry->flags & IMA_UID) { snprintf(tbuf, sizeof(tbuf), "%d", __kuid_val(entry->uid)); if (entry->uid_op == &uid_gt) seq_printf(m, pt(Opt_uid_gt), tbuf); else if (entry->uid_op == &uid_lt) seq_printf(m, pt(Opt_uid_lt), tbuf); else seq_printf(m, pt(Opt_uid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_EUID) { snprintf(tbuf, sizeof(tbuf), "%d", __kuid_val(entry->uid)); if (entry->uid_op == &uid_gt) seq_printf(m, pt(Opt_euid_gt), tbuf); else if (entry->uid_op == &uid_lt) seq_printf(m, pt(Opt_euid_lt), tbuf); else seq_printf(m, pt(Opt_euid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_GID) { snprintf(tbuf, sizeof(tbuf), "%d", __kgid_val(entry->gid)); if (entry->gid_op == &gid_gt) seq_printf(m, pt(Opt_gid_gt), tbuf); else if (entry->gid_op == &gid_lt) seq_printf(m, pt(Opt_gid_lt), tbuf); else seq_printf(m, pt(Opt_gid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_EGID) { snprintf(tbuf, sizeof(tbuf), "%d", __kgid_val(entry->gid)); if (entry->gid_op == &gid_gt) seq_printf(m, pt(Opt_egid_gt), tbuf); else if (entry->gid_op == &gid_lt) seq_printf(m, pt(Opt_egid_lt), tbuf); else seq_printf(m, 
pt(Opt_egid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FOWNER) { snprintf(tbuf, sizeof(tbuf), "%d", __kuid_val(entry->fowner)); if (entry->fowner_op == &vfsuid_gt_kuid) seq_printf(m, pt(Opt_fowner_gt), tbuf); else if (entry->fowner_op == &vfsuid_lt_kuid) seq_printf(m, pt(Opt_fowner_lt), tbuf); else seq_printf(m, pt(Opt_fowner_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FGROUP) { snprintf(tbuf, sizeof(tbuf), "%d", __kgid_val(entry->fgroup)); if (entry->fgroup_op == &vfsgid_gt_kgid) seq_printf(m, pt(Opt_fgroup_gt), tbuf); else if (entry->fgroup_op == &vfsgid_lt_kgid) seq_printf(m, pt(Opt_fgroup_lt), tbuf); else seq_printf(m, pt(Opt_fgroup_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_VALIDATE_ALGOS) { seq_puts(m, "appraise_algos="); ima_policy_show_appraise_algos(m, entry->allowed_algos); seq_puts(m, " "); } for (i = 0; i < MAX_LSM_RULES; i++) { if (entry->lsm[i].rule) { switch (i) { case LSM_OBJ_USER: seq_printf(m, pt(Opt_obj_user), entry->lsm[i].args_p); break; case LSM_OBJ_ROLE: seq_printf(m, pt(Opt_obj_role), entry->lsm[i].args_p); break; case LSM_OBJ_TYPE: seq_printf(m, pt(Opt_obj_type), entry->lsm[i].args_p); break; case LSM_SUBJ_USER: seq_printf(m, pt(Opt_subj_user), entry->lsm[i].args_p); break; case LSM_SUBJ_ROLE: seq_printf(m, pt(Opt_subj_role), entry->lsm[i].args_p); break; case LSM_SUBJ_TYPE: seq_printf(m, pt(Opt_subj_type), entry->lsm[i].args_p); break; } seq_puts(m, " "); } } if (entry->template) seq_printf(m, "template=%s ", entry->template->name); if (entry->flags & IMA_DIGSIG_REQUIRED) { if (entry->flags & IMA_VERITY_REQUIRED) seq_puts(m, "appraise_type=sigv3 "); else if (entry->flags & IMA_MODSIG_ALLOWED) seq_puts(m, "appraise_type=imasig|modsig "); else seq_puts(m, "appraise_type=imasig "); } if (entry->flags & IMA_VERITY_REQUIRED) seq_puts(m, "digest_type=verity "); if (entry->flags & IMA_PERMIT_DIRECTIO) seq_puts(m, "permit_directio "); rcu_read_unlock(); seq_puts(m, "\n"); return 0; } #endif /* CONFIG_IMA_READ_POLICY */ #if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING) /* * ima_appraise_signature: whether IMA will appraise a given function using * an IMA digital signature. This is restricted to cases where the kernel * has a set of built-in trusted keys in order to avoid an attacker simply * loading additional keys. */ bool ima_appraise_signature(enum kernel_read_file_id id) { struct ima_rule_entry *entry; bool found = false; enum ima_hooks func; struct list_head *ima_rules_tmp; if (id >= READING_MAX_ID) return false; if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) && security_locked_down(LOCKDOWN_KEXEC)) return false; func = read_idmap[id] ?: FILE_CHECK; rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { if (entry->action != APPRAISE) continue; /* * A generic entry will match, but otherwise require that it * match the func we're looking for */ if (entry->func && entry->func != func) continue; /* * We require this to be a digital signature, not a raw IMA * hash. */ if (entry->flags & IMA_DIGSIG_REQUIRED) found = true; /* * We've found a rule that matches, so break now even if it * didn't require a digital signature - a later rule that does * won't override it, so would be a false positive. */ break; } rcu_read_unlock(); return found; } #endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */ |
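/*
 * Example (not part of the kernel sources above): a minimal user-space
 * sketch of feeding custom rules to the IMA policy parser through
 * securityfs, using the grammar accepted by ima_parse_rule() and the
 * policy_tokens table.  The mount point /sys/kernel/security and the
 * sample rules themselves are illustrative assumptions only.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* One rule per line; each line ends up in ima_parse_add_rule(). */
	static const char *rules[] = {
		"measure func=BPRM_CHECK mask=MAY_EXEC uid=0\n",
		"appraise func=FILE_CHECK fsmagic=0xef53 appraise_type=imasig\n",
		"dont_measure fsmagic=0x9fa0\n",	/* PROC_SUPER_MAGIC */
	};
	int fd = open("/sys/kernel/security/ima/policy", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	for (size_t i = 0; i < sizeof(rules) / sizeof(rules[0]); i++)
		if (write(fd, rules[i], strlen(rules[i])) < 0)
			perror("write");
	/*
	 * Closing the file commits the rules: per ima_update_policy()'s
	 * kernel-doc above, the .release handler splices ima_temp_rules
	 * into ima_policy_rules.
	 */
	return close(fd);
}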
61 243 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | // SPDX-License-Identifier: GPL-2.0-only /* * Unified UUID/GUID definition * * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/uuid.h> #include <linux/random.h> const guid_t guid_null; EXPORT_SYMBOL(guid_null); const uuid_t uuid_null; EXPORT_SYMBOL(uuid_null); const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; const u8 uuid_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; /** * generate_random_uuid - generate a random UUID * @uuid: where to put the generated UUID * * Random UUID interface * * Used to create a Boot ID or a filesystem UUID/GUID, but can be * useful for other kernel drivers. */ void generate_random_uuid(unsigned char uuid[16]) { get_random_bytes(uuid, 16); /* Set UUID version to 4 --- truly random generation */ uuid[6] = (uuid[6] & 0x0F) | 0x40; /* Set the UUID variant to DCE */ uuid[8] = (uuid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_uuid); void generate_random_guid(unsigned char guid[16]) { get_random_bytes(guid, 16); /* Set GUID version to 4 --- truly random generation */ guid[7] = (guid[7] & 0x0F) | 0x40; /* Set the GUID variant to DCE */ guid[8] = (guid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_guid); static void __uuid_gen_common(__u8 b[16]) { get_random_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } void guid_gen(guid_t *lu) { __uuid_gen_common(lu->b); /* version 4 : random generation */ lu->b[7] = (lu->b[7] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(guid_gen); void uuid_gen(uuid_t *bu) { __uuid_gen_common(bu->b); /* version 4 : random generation */ bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_gen); /** * uuid_is_valid - checks if a UUID string is valid * @uuid: UUID string to check * * Description: * It checks if the UUID string is following the format: * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx * * where x is a hex digit. * * Return: true if input is valid UUID string. */ bool uuid_is_valid(const char *uuid) { unsigned int i; for (i = 0; i < UUID_STRING_LEN; i++) { if (i == 8 || i == 13 || i == 18 || i == 23) { if (uuid[i] != '-') return false; } else if (!isxdigit(uuid[i])) { return false; } } return true; } EXPORT_SYMBOL(uuid_is_valid); static int __uuid_parse(const char *uuid, __u8 b[16], const u8 ei[16]) { static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; unsigned int i; if (!uuid_is_valid(uuid)) return -EINVAL; for (i = 0; i < 16; i++) { int hi = hex_to_bin(uuid[si[i] + 0]); int lo = hex_to_bin(uuid[si[i] + 1]); b[ei[i]] = (hi << 4) | lo; } return 0; } int guid_parse(const char *uuid, guid_t *u) { return __uuid_parse(uuid, u->b, guid_index); } EXPORT_SYMBOL(guid_parse); int uuid_parse(const char *uuid, uuid_t *u) { return __uuid_parse(uuid, u->b, uuid_index); } EXPORT_SYMBOL(uuid_parse); |
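/*
 * Example (not part of lib/uuid.c above): a self-contained user-space
 * sketch of the parsing scheme used by __uuid_parse().  si[] holds the
 * string offset of each byte's two hex digits, and the index table
 * controls byte order - uuid_index is the identity (big endian
 * throughout) while guid_index byte-swaps the first three fields
 * (little endian).  hexval() stands in for the kernel's hex_to_bin().
 */
#include <stdio.h>
#include <string.h>

static int hexval(char c)
{
	const char *digits = "0123456789abcdef";
	const char *p = strchr(digits, (c >= 'A' && c <= 'F') ? c + 32 : c);

	return (p && *p) ? (int)(p - digits) : -1;
}

static int parse16(const char *s, unsigned char b[16], const unsigned char ei[16])
{
	static const unsigned char si[16] = {
		0, 2, 4, 6, 9, 11, 14, 16, 19, 21, 24, 26, 28, 30, 32, 34
	};

	for (int i = 0; i < 16; i++) {
		int hi = hexval(s[si[i]]), lo = hexval(s[si[i] + 1]);

		if (hi < 0 || lo < 0)
			return -1;
		b[ei[i]] = (unsigned char)((hi << 4) | lo);
	}
	return 0;
}

int main(void)
{
	static const unsigned char uuid_ei[16] = { 0, 1, 2, 3, 4, 5, 6, 7,
						   8, 9, 10, 11, 12, 13, 14, 15 };
	static const unsigned char guid_ei[16] = { 3, 2, 1, 0, 5, 4, 7, 6,
						   8, 9, 10, 11, 12, 13, 14, 15 };
	const char *s = "12345678-9abc-def0-1234-56789abcdef0";
	unsigned char u[16], g[16];

	if (parse16(s, u, uuid_ei) || parse16(s, g, guid_ei))
		return 1;
	printf("uuid: %02x %02x %02x %02x ...\n", u[0], u[1], u[2], u[3]);
	printf("guid: %02x %02x %02x %02x ...\n", g[0], g[1], g[2], g[3]);
	return 0;	/* uuid: 12 34 56 78 ...   guid: 78 56 34 12 ... */
}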
3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 | // SPDX-License-Identifier: GPL-2.0-or-later /* * MPLS GSO Support * * Authors: Simon Horman (horms@verge.net.au) * * Based on: GSO portions of net/ipv4/gre.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/err.h> #include <linux/module.h> #include <linux/netdev_features.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/gso.h> #include <net/mpls.h> static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; netdev_features_t mpls_features; u16 mac_len = skb->mac_len; __be16 mpls_protocol; unsigned int mpls_hlen; if (!skb_inner_network_header_was_set(skb)) goto out; skb_reset_network_header(skb); mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb); if (unlikely(!mpls_hlen || mpls_hlen % MPLS_HLEN)) goto out; if (unlikely(!pskb_may_pull(skb, mpls_hlen))) goto out; /* Setup inner SKB. */ mpls_protocol = skb->protocol; skb->protocol = skb->inner_protocol; __skb_pull(skb, mpls_hlen); skb->mac_len = 0; skb_reset_mac_header(skb); /* Segment inner packet. */ mpls_features = skb->dev->mpls_features & features; segs = skb_mac_gso_segment(skb, mpls_features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset, mac_len); goto out; } skb = segs; mpls_hlen += mac_len; do { skb->mac_len = mac_len; skb->protocol = mpls_protocol; skb_reset_inner_network_header(skb); __skb_push(skb, mpls_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); } while ((skb = skb->next)); out: return segs; } static struct packet_offload mpls_mc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_MC), .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, }; static struct packet_offload mpls_uc_offload __read_mostly = { .type = cpu_to_be16(ETH_P_MPLS_UC), .priority = 15, .callbacks = { .gso_segment = mpls_gso_segment, }, }; static int __init mpls_gso_init(void) { pr_info("MPLS GSO support\n"); dev_add_offload(&mpls_uc_offload); dev_add_offload(&mpls_mc_offload); return 0; } static void __exit mpls_gso_exit(void) { dev_remove_offload(&mpls_uc_offload); dev_remove_offload(&mpls_mc_offload); } module_init(mpls_gso_init); module_exit(mpls_gso_exit); MODULE_DESCRIPTION("MPLS GSO support"); MODULE_AUTHOR("Simon Horman <horms@verge.net.au>"); MODULE_LICENSE("GPL"); |
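/*
 * Example (not part of net/mpls/mpls_gso.c above): each MPLS label stack
 * entry is MPLS_HLEN (4) bytes - label(20) | TC(3) | S(1) | TTL(8) per
 * RFC 3032 - which is why mpls_gso_segment() rejects a header span that
 * is zero or not a multiple of MPLS_HLEN.  A user-space sketch of the
 * encoding and of that sanity check, for illustration only:
 */
#include <stdint.h>
#include <stdio.h>

#define MPLS_HLEN 4

static uint32_t mpls_lse(uint32_t label, uint8_t tc, int bos, uint8_t ttl)
{
	return (label << 12) | ((uint32_t)(tc & 0x7) << 9) |
	       ((bos ? 1u : 0u) << 8) | ttl;
}

int main(void)
{
	/*
	 * A two-entry stack: 2 * MPLS_HLEN = 8 bytes between the MPLS
	 * header and the inner network header.
	 */
	uint32_t stack[2] = {
		mpls_lse(100, 0, 0, 64),	/* outer label, S=0 */
		mpls_lse(200, 0, 1, 64),	/* bottom of stack, S=1 */
	};
	unsigned int mpls_hlen = sizeof(stack);

	printf("hlen=%u ok=%d lse0=0x%08x\n", mpls_hlen,
	       mpls_hlen && !(mpls_hlen % MPLS_HLEN), stack[0]);
	return 0;
}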
4 3 3887 3887 3885 3889 2209 2209 3886 349 3643 3655 3655 3648 292 3643 14 14 14 14 2661 2652 2667 2661 24 320 26 23 2031 11 217 390 21 1 5 257 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H #include <linux/atomic.h> #include <linux/huge_mm.h> #include <linux/mm_types.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> #include <linux/swapops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? * @folio: The folio to test. * * We would like to get this info without a page flag, but the state * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. * * Return: An integer (not a boolean!) used to sort a folio onto the * right LRU list and to account folios correctly. * 1 if @folio is a regular filesystem backed page cache folio * or a lazily freed anonymous folio (e.g. via MADV_FREE). * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. 
*/ static inline int folio_is_file_lru(struct folio *folio) { return !folio_test_swapbacked(folio); } static inline int page_is_file_lru(struct page *page) { return folio_is_file_lru(page_folio(page)); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment on MAX_NR_TIERS */ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); if (!(flags & BIT(PG_referenced))) return 0; /* * Return the total number of accesses including PG_referenced. Also see * the comment on LRU_REFS_FLAGS. 
*/ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ * | Accessed through page tables and | Accessed through file descriptors | * | promoted by folio_update_gen() | and protected by folio_inc_gen() | * +-----------------------------------+-----------------------------------+ * | PG_active (set while isolated) | | * +-----------------+-----------------+-----------------+-----------------+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | * +-----------------------------------+-----------------------------------+ * |<---------- MIN_NR_GENS ---------->| | * |<---------------------------- MAX_NR_GENS ---------------------------->| */ if (folio_test_active(folio)) gen = MIN_NR_GENS - folio_test_workingset(folio); else if (reclaiming) gen = MAX_NR_GENS; else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) gen = MIN_NR_GENS; else gen = MAX_NR_GENS - folio_test_workingset(folio); return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if (folio_test_unevictable(folio) || 
!lrugen->enabled) return false; seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; set_mask_bits(&new->flags, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name 
*anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment if constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTLs release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. */ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment. 
* * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( swp_entry_t entry, struct vm_area_struct *dst_vma) { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } #endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * none pte also means we are not demoting the pte so tlb flushed is not needed. * E.g., when pte cleared the caller should have taken care of the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * Returns true if an uffd-wp pte was installed, false otherwise. */ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { #ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } #endif return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif |
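/*
 * Illustrative sketch only (not part of the header above or the file below):
 * the calling convention that pte_install_uffd_wp_if_needed() documents --
 * clear the PTE first, under the page table lock, then let the helper decide
 * whether to leave a PTE_MARKER_UFFD_WP marker behind. The function name
 * zap_one_pte_example() and its shape are hypothetical; the real users are
 * the zap/TTU paths mentioned in the comment above.
 */
static void zap_one_pte_example(struct vm_area_struct *vma, unsigned long addr,
				pte_t *pte)
{
	/* caller holds the PTL for this PTE; TLB flushing stays its problem */
	pte_t pteval = ptep_get_and_clear(vma->vm_mm, addr, pte);

	/* ... drop whatever page or swap reference pteval described ... */

	/*
	 * *pte is now none, so the helper may safely replace it with a
	 * uffd-wp marker if the old value was wr-protected.
	 */
	pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
}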
// SPDX-License-Identifier: GPL-2.0-only /* *
vDPA bus. * * Copyright (c) 2020, Red Hat. All rights reserved. * Author: Jason Wang <jasowang@redhat.com> * */ #include <linux/module.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/vdpa.h> #include <uapi/linux/vdpa.h> #include <net/genetlink.h> #include <linux/mod_devicetable.h> #include <linux/virtio_ids.h> static LIST_HEAD(mdev_head); /* A global rwsem that protects the vdpa management device and device level operations. */ static DECLARE_RWSEM(vdpa_dev_lock); static DEFINE_IDA(vdpa_index_ida); void vdpa_set_status(struct vdpa_device *vdev, u8 status) { down_write(&vdev->cf_lock); vdev->config->set_status(vdev, status); up_write(&vdev->cf_lock); } EXPORT_SYMBOL(vdpa_set_status); static struct genl_family vdpa_nl_family; static int vdpa_dev_probe(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); const struct vdpa_config_ops *ops = vdev->config; u32 max_num, min_num = 1; int ret = 0; d->dma_mask = &d->coherent_dma_mask; ret = dma_set_mask_and_coherent(d, DMA_BIT_MASK(64)); if (ret) return ret; max_num = ops->get_vq_num_max(vdev); if (ops->get_vq_num_min) min_num = ops->get_vq_num_min(vdev); if (max_num < min_num) return -EINVAL; if (drv && drv->probe) ret = drv->probe(vdev); return ret; } static void vdpa_dev_remove(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); if (drv && drv->remove) drv->remove(vdev); } static int vdpa_dev_match(struct device *dev, const struct device_driver *drv) { struct vdpa_device *vdev = dev_to_vdpa(dev); /* Check override first, and if set, only use the named driver */ if (vdev->driver_override) return strcmp(vdev->driver_override, drv->name) == 0; /* Currently devices must be supported by all vDPA bus drivers */ return 1; } static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct vdpa_device *vdev = dev_to_vdpa(dev); int ret; ret = driver_set_override(dev, &vdev->driver_override, buf, count); if (ret) return ret; return count; } static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct vdpa_device *vdev = dev_to_vdpa(dev); ssize_t len; device_lock(dev); len = sysfs_emit(buf, "%s\n", vdev->driver_override); device_unlock(dev); return len; } static DEVICE_ATTR_RW(driver_override); static struct attribute *vdpa_dev_attrs[] = { &dev_attr_driver_override.attr, NULL, }; static const struct attribute_group vdpa_dev_group = { .attrs = vdpa_dev_attrs, }; __ATTRIBUTE_GROUPS(vdpa_dev); static const struct bus_type vdpa_bus = { .name = "vdpa", .dev_groups = vdpa_dev_groups, .match = vdpa_dev_match, .probe = vdpa_dev_probe, .remove = vdpa_dev_remove, }; static void vdpa_release_dev(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); const struct vdpa_config_ops *ops = vdev->config; if (ops->free) ops->free(vdev); ida_free(&vdpa_index_ida, vdev->index); kfree(vdev->driver_override); kfree(vdev); } /** * __vdpa_alloc_device - allocate and initialize a vDPA device * This allows the driver to do some preparation after the device is * initialized but before it is registered. * @parent: the parent device * @config: the bus operations that are supported by this device * @ngroups: number of groups supported by this device * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data * @name: name of the vdpa device; optional.
* @use_va: indicate whether virtual addresses must be used by this device * * Driver should use the vdpa_alloc_device() wrapper macro instead of * using this directly. * * Return: Returns an error when parent/config/dma_dev is not set or when it fails to get * an ida. */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va) { struct vdpa_device *vdev; int err = -EINVAL; if (!config) goto err; if (!!config->dma_map != !!config->dma_unmap) goto err; /* It should only work for a device that uses an on-chip IOMMU */ if (use_va && !(config->dma_map || config->set_map)) goto err; err = -ENOMEM; vdev = kzalloc(size, GFP_KERNEL); if (!vdev) goto err; err = ida_alloc(&vdpa_index_ida, GFP_KERNEL); if (err < 0) goto err_ida; vdev->dev.bus = &vdpa_bus; vdev->dev.parent = parent; vdev->dev.release = vdpa_release_dev; vdev->index = err; vdev->config = config; vdev->features_valid = false; vdev->use_va = use_va; vdev->ngroups = ngroups; vdev->nas = nas; if (name) err = dev_set_name(&vdev->dev, "%s", name); else err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index); if (err) goto err_name; init_rwsem(&vdev->cf_lock); device_initialize(&vdev->dev); return vdev; err_name: ida_free(&vdpa_index_ida, vdev->index); err_ida: kfree(vdev); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(__vdpa_alloc_device); static int vdpa_name_match(struct device *dev, const void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); return (strcmp(dev_name(&vdev->dev), data) == 0); } static int __vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { struct device *dev; vdev->nvqs = nvqs; lockdep_assert_held(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); if (dev) { put_device(dev); return -EEXIST; } return device_add(&vdev->dev); } /** * _vdpa_register_device - register a vDPA device with the vdpa lock held * The caller must have made a successful call to vdpa_alloc_device() before. * The caller must invoke this routine in the management device dev_add() * callback after setting up a valid mgmtdev for this vdpa device. * @vdev: the vdpa device to be registered to the vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when it fails to add the device to the vDPA bus */ int _vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { if (!vdev->mdev) return -EINVAL; return __vdpa_register_device(vdev, nvqs); } EXPORT_SYMBOL_GPL(_vdpa_register_device); /** * vdpa_register_device - register a vDPA device * Callers must have made a successful call to vdpa_alloc_device() before. * @vdev: the vdpa device to be registered to the vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when it fails to add the device to the vDPA bus */ int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { int err; down_write(&vdpa_dev_lock); err = __vdpa_register_device(vdev, nvqs); up_write(&vdpa_dev_lock); return err; } EXPORT_SYMBOL_GPL(vdpa_register_device); /** * _vdpa_unregister_device - unregister a vDPA device * The caller must invoke this routine as part of the management device dev_del() * callback.
* @vdev: the vdpa device to be unregistered from the vDPA bus */ void _vdpa_unregister_device(struct vdpa_device *vdev) { lockdep_assert_held(&vdpa_dev_lock); WARN_ON(!vdev->mdev); device_unregister(&vdev->dev); } EXPORT_SYMBOL_GPL(_vdpa_unregister_device); /** * vdpa_unregister_device - unregister a vDPA device * @vdev: the vdpa device to be unregistered from the vDPA bus */ void vdpa_unregister_device(struct vdpa_device *vdev) { down_write(&vdpa_dev_lock); device_unregister(&vdev->dev); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_unregister_device); /** * __vdpa_register_driver - register a vDPA device driver * @drv: the vdpa device driver to be registered * @owner: module owner of the driver * * Return: Returns an error when the registration fails */ int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner) { drv->driver.bus = &vdpa_bus; drv->driver.owner = owner; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(__vdpa_register_driver); /** * vdpa_unregister_driver - unregister a vDPA device driver * @drv: the vdpa device driver to be unregistered */ void vdpa_unregister_driver(struct vdpa_driver *drv) { driver_unregister(&drv->driver); } EXPORT_SYMBOL_GPL(vdpa_unregister_driver); /** * vdpa_mgmtdev_register - register a vdpa management device * * @mdev: Pointer to vdpa management device * vdpa_mgmtdev_register() registers a vdpa management device which supports * vdpa device management. * Return: Returns 0 on success, or an error when the required callback ops are not * initialized. */ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) { if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del) return -EINVAL; INIT_LIST_HEAD(&mdev->list); down_write(&vdpa_dev_lock); list_add_tail(&mdev->list, &mdev_head); up_write(&vdpa_dev_lock); return 0; } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register); static int vdpa_match_remove(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_mgmt_dev *mdev = vdev->mdev; if (mdev == data) mdev->ops->dev_del(mdev, vdev); return 0; } void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev) { down_write(&vdpa_dev_lock); list_del(&mdev->list); /* Filter out all the entries belonging to this management device and delete them. */ bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister); static void vdpa_get_config_unlocked(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { const struct vdpa_config_ops *ops = vdev->config; /* * Config accesses aren't supposed to trigger before features are set. * If they do happen, we assume a legacy guest. */ if (!vdev->features_valid) vdpa_set_features_unlocked(vdev, 0); ops->get_config(vdev, offset, buf, len); } /** * vdpa_get_config - Get one or more device configuration fields. * @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read to * @len: length of the configuration fields in bytes */ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { down_read(&vdev->cf_lock); vdpa_get_config_unlocked(vdev, offset, buf, len); up_read(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_get_config); /** * vdpa_set_config - Set one or more device configuration fields.
* @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read from * @length: length of the configuration fields in bytes */ void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int length) { down_write(&vdev->cf_lock); vdev->config->set_config(vdev, offset, buf, length); up_write(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_set_config); static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev, const char *busname, const char *devname) { /* Bus name is optional for simulated management device, so ignore the * device with bus if bus attribute is provided. */ if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus)) return false; if (!busname && strcmp(dev_name(mdev->device), devname) == 0) return true; if (busname && (strcmp(mdev->device->bus->name, busname) == 0) && (strcmp(dev_name(mdev->device), devname) == 0)) return true; return false; } static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs) { struct vdpa_mgmt_dev *mdev; const char *busname = NULL; const char *devname; if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]) return ERR_PTR(-EINVAL); devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]); if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]) busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]); list_for_each_entry(mdev, &mdev_head, list) { if (mgmtdev_handle_match(mdev, busname, devname)) return mdev; } return ERR_PTR(-ENODEV); } static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev) { if (mdev->device->bus && nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name)) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device))) return -EMSGSIZE; return 0; } static u64 vdpa_mgmtdev_get_classes(const struct vdpa_mgmt_dev *mdev, unsigned int *nclasses) { u64 supported_classes = 0; unsigned int n = 0; for (int i = 0; mdev->id_table[i].device; i++) { if (mdev->id_table[i].device > 63) continue; supported_classes |= BIT_ULL(mdev->id_table[i].device); n++; } if (nclasses) *nclasses = n; return supported_classes; } static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, mdev); if (err) goto msg_err; if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, vdpa_mgmtdev_get_classes(mdev, NULL), VDPA_ATTR_UNSPEC)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u32(msg, VDPA_ATTR_DEV_MGMTDEV_MAX_VQS, mdev->max_supported_vqs)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_SUPPORTED_FEATURES, mdev->supported_features, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { up_read(&vdpa_dev_lock); NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device"); err = PTR_ERR(mdev); goto out; } err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0); up_read(&vdpa_dev_lock); if (err) goto out; err = genlmsg_reply(msg, info); 
return err; out: nlmsg_free(msg); return err; } static int vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_mgmt_dev *mdev; int start = cb->args[0]; int idx = 0; int err; down_read(&vdpa_dev_lock); list_for_each_entry(mdev, &mdev_head, list) { if (idx < start) { idx++; continue; } err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) goto out; idx++; } out: up_read(&vdpa_dev_lock); cb->args[0] = idx; return msg->len; } #define VDPA_DEV_NET_ATTRS_MASK (BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) /* * Bitmask for all per-device features: feature bits VIRTIO_TRANSPORT_F_START * through VIRTIO_TRANSPORT_F_END are unset, i.e. 0xfffffc000fffffff for * all 64bit features. If the features are extended beyond 64 bits, or new * "holes" are reserved for other type of features than per-device, this * macro would have to be updated. */ #define VIRTIO_DEVICE_F_MASK (~0ULL << (VIRTIO_TRANSPORT_F_END + 1) | \ ((1ULL << VIRTIO_TRANSPORT_F_START) - 1)) static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_dev_set_config config = {}; struct nlattr **nl_attrs = info->attrs; struct vdpa_mgmt_dev *mdev; unsigned int ncls = 0; const u8 *macaddr; const char *name; u64 classes; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); memcpy(config.net.mac, macaddr, sizeof(config.net.mac)); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]) { config.net.mtu = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]) { config.net.max_vq_pairs = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]); if (!config.net.max_vq_pairs) { NL_SET_ERR_MSG_MOD(info->extack, "At least one pair of VQs is required"); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); } if (nl_attrs[VDPA_ATTR_DEV_FEATURES]) { u64 missing = 0x0ULL; config.device_features = nla_get_u64(nl_attrs[VDPA_ATTR_DEV_FEATURES]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MAC))) missing |= BIT_ULL(VIRTIO_NET_F_MAC); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MTU))) missing |= BIT_ULL(VIRTIO_NET_F_MTU); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] && config.net.max_vq_pairs > 1 && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MQ))) missing |= BIT_ULL(VIRTIO_NET_F_MQ); if (missing) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Missing features 0x%llx for provided attributes", missing); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); } /* Skip checking capability if user didn't prefer to configure any * device networking attributes. It is likely that user might have used * a device specific method to configure such attributes or using device * default attributes. 
*/ if ((config.mask & VDPA_DEV_NET_ATTRS_MASK) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; down_write(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device"); err = PTR_ERR(mdev); goto err; } if ((config.mask & mdev->config_attr_mask) != config.mask) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Some provided attributes are not supported: 0x%llx", config.mask & ~mdev->config_attr_mask); err = -EOPNOTSUPP; goto err; } classes = vdpa_mgmtdev_get_classes(mdev, &ncls); if (config.mask & VDPA_DEV_NET_ATTRS_MASK && !(classes & BIT_ULL(VIRTIO_ID_NET))) { NL_SET_ERR_MSG_MOD(info->extack, "Network class attributes provided on unsupported management device"); err = -EINVAL; goto err; } if (!(config.mask & VDPA_DEV_NET_ATTRS_MASK) && config.mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES) && classes & BIT_ULL(VIRTIO_ID_NET) && ncls > 1 && config.device_features & VIRTIO_DEVICE_F_MASK) { NL_SET_ERR_MSG_MOD(info->extack, "Management device supports multi-class while device features specified are ambiguous"); err = -EINVAL; goto err; } err = mdev->ops->dev_add(mdev, name, &config); err: up_write(&vdpa_dev_lock); return err; } static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct vdpa_device *vdev; struct device *dev; const char *name; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); down_write(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user"); err = -EINVAL; goto mdev_err; } mdev = vdev->mdev; mdev->ops->dev_del(mdev, vdev); mdev_err: put_device(dev); dev_err: up_write(&vdpa_dev_lock); return err; } static int vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u16 max_vq_size; u16 min_vq_size = 1; u32 device_id; u32 vendor_id; void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev); if (err) goto msg_err; device_id = vdev->config->get_device_id(vdev); vendor_id = vdev->config->get_vendor_id(vdev); max_vq_size = vdev->config->get_vq_num_max(vdev); if (vdev->config->get_vq_num_min) min_vq_size = vdev->config->get_vq_num_min(vdev); err = -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MIN_VQ_SIZE, min_vq_size)) goto msg_err; genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if 
(!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { err = -EINVAL; goto mdev_err; } err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); err: up_read(&vdpa_dev_lock); nlmsg_free(msg); return err; } struct vdpa_dev_dump_info { struct sk_buff *msg; struct netlink_callback *cb; int start_idx; int idx; }; static int vdpa_dev_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_dev_net_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MQ)) == 0 && (features & BIT_ULL(VIRTIO_NET_F_RSS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->max_virtqueue_pairs); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, val_u16); } static int vdpa_dev_net_mtu_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MTU)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->mtu); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16); } static int vdpa_dev_net_mac_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { if ((features & BIT_ULL(VIRTIO_NET_F_MAC)) == 0) return 0; return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR, sizeof(config->mac), config->mac); } static int vdpa_dev_net_status_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_STATUS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->status); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16); } static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_net_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_net_mtu_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_mac_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_status_config_fill(msg, features_device, &config)) return -EMSGSIZE; return vdpa_dev_net_mq_config_fill(msg, features_device, &config); } static int vdpa_dev_blk_capacity_config_fill(struct sk_buff *msg, const struct virtio_blk_config *config) { u64 val_u64; val_u64 = 
__virtio64_to_cpu(true, config->capacity); return nla_put_u64_64bit(msg, VDPA_ATTR_DEV_BLK_CFG_CAPACITY, val_u64, VDPA_ATTR_PAD); } static int vdpa_dev_blk_seg_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SIZE_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->size_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SIZE_MAX, val_u32); } /* fill the block size*/ static int vdpa_dev_blk_block_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_BLK_SIZE)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->blk_size); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_BLK_SIZE, val_u32); } static int vdpa_dev_blk_seg_max_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SEG_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->seg_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SEG_MAX, val_u32); } static int vdpa_dev_blk_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_BLK_F_MQ)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->num_queues); return nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_NUM_QUEUES, val_u16); } static int vdpa_dev_blk_topology_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 min_io_size; u32 opt_io_size; if ((features & BIT_ULL(VIRTIO_BLK_F_TOPOLOGY)) == 0) return 0; min_io_size = __virtio16_to_cpu(true, config->min_io_size); opt_io_size = __virtio32_to_cpu(true, config->opt_io_size); if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_PHY_BLK_EXP, config->physical_block_exp)) return -EMSGSIZE; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_ALIGN_OFFSET, config->alignment_offset)) return -EMSGSIZE; if (nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_MIN_IO_SIZE, min_io_size)) return -EMSGSIZE; if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_OPT_IO_SIZE, opt_io_size)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_discard_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_DISCARD)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_discard_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_discard_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEG, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->discard_sector_alignment); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_DISCARD_SEC_ALIGN, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_write_zeroes_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_WRITE_ZEROES)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEG, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_ro_config_fill(struct sk_buff *msg, u64 features) { u8 ro; ro = ((features & BIT_ULL(VIRTIO_BLK_F_RO)) == 0) ? 
0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_READ_ONLY, ro)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_flush_config_fill(struct sk_buff *msg, u64 features) { u8 flush; flush = ((features & BIT_ULL(VIRTIO_BLK_F_FLUSH)) == 0) ? 0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_FLUSH, flush)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_blk_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_blk_capacity_config_fill(msg, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_block_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_max_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_mq_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_topology_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_discard_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_write_zeroes_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_ro_config_fill(msg, features_device)) return -EMSGSIZE; if (vdpa_dev_blk_flush_config_fill(msg, features_device)) return -EMSGSIZE; return 0; } static int vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u64 features_driver; u8 status = 0; u32 device_id; void *hdr; int err; down_read(&vdev->cf_lock); hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_CONFIG_GET); if (!hdr) { err = -EMSGSIZE; goto out; } if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto msg_err; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto msg_err; } /* only read driver features after the feature negotiation is done */ status = vdev->config->get_status(vdev); if (status & VIRTIO_CONFIG_S_FEATURES_OK) { features_driver = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features_driver, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } } switch (device_id) { case VIRTIO_ID_NET: err = vdpa_dev_net_config_fill(vdev, msg); break; case VIRTIO_ID_BLOCK: err = vdpa_dev_blk_config_fill(vdev, msg); break; default: err = -EOPNOTSUPP; break; } if (err) goto msg_err; up_read(&vdev->cf_lock); genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); out: up_read(&vdev->cf_lock); return err; } static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { struct virtio_net_config config = {}; u64 features; u8 status; int err; status = vdev->config->get_status(vdev); if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { NL_SET_ERR_MSG_MOD(info->extack, "feature negotiation not complete"); return -EAGAIN; } vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); features = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features, VDPA_ATTR_PAD)) return -EMSGSIZE; err = vdpa_dev_net_mq_config_fill(msg, features, &config); if (err) return err; if (nla_put_u32(msg, 
VDPA_ATTR_DEV_QUEUE_INDEX, index)) return -EMSGSIZE; err = vdev->config->get_vendor_vq_stats(vdev, index, msg, info->extack); if (err) return err; return 0; } static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { int err; down_read(&vdev->cf_lock); if (!vdev->config->get_vendor_vq_stats) { err = -EOPNOTSUPP; goto out; } err = vdpa_fill_stats_rec(vdev, msg, info, index); out: up_read(&vdev->cf_lock); return err; } static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { u32 device_id; void *hdr; int err; u32 portid = info->snd_portid; u32 seq = info->snd_seq; u32 flags = 0; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_VSTATS_GET); if (!hdr) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto undo_msg; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto undo_msg; } switch (device_id) { case VIRTIO_ID_NET: if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { NL_SET_ERR_MSG_MOD(info->extack, "queue index exceeds max value"); err = -ERANGE; break; } err = vendor_stats_fill(vdev, msg, info, index); break; default: err = -EOPNOTSUPP; break; } genlmsg_end(msg, hdr); return err; undo_msg: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_config_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (!err) err = genlmsg_reply(msg, info); mdev_err: put_device(dev); dev_err: up_read(&vdpa_dev_lock); if (err) nlmsg_free(msg); return err; } static int vdpa_dev_net_device_attr_set(struct vdpa_device *vdev, struct genl_info *info) { struct vdpa_dev_set_config set_config = {}; struct vdpa_mgmt_dev *mdev = vdev->mdev; struct nlattr **nl_attrs = info->attrs; const u8 *macaddr; int err = -EOPNOTSUPP; down_write(&vdev->cf_lock); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { set_config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); if (is_valid_ether_addr(macaddr)) { ether_addr_copy(set_config.net.mac, macaddr); if (mdev->ops->dev_set_attr) { err = mdev->ops->dev_set_attr(mdev, vdev, &set_config); } else { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Operation not supported by the device."); } } else { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Invalid MAC address"); } } up_write(&vdev->cf_lock); return err; } static int vdpa_nl_cmd_dev_attr_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct device *dev; const char *name; u64 classes; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); down_write(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, name, 
vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } classes = vdpa_mgmtdev_get_classes(vdev->mdev, NULL); if (classes & BIT_ULL(VIRTIO_ID_NET)) { err = vdpa_dev_net_device_attr_set(vdev, info); } else { NL_SET_ERR_MSG_FMT_MOD(info->extack, "%s device not supported", name); } mdev_err: put_device(dev); dev_err: up_write(&vdpa_dev_lock); return err; } static int vdpa_dev_config_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_config_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_config_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; u32 index; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; if (!info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]); down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_vendor_stats_fill(vdev, msg, info, index); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); dev_err: nlmsg_free(msg); up_read(&vdpa_dev_lock); return err; } static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR, [VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 }, /* virtio spec 1.1 section 5.1.4.1 for valid MTU range */ [VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68), [VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 }, [VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 }, }; static const struct genl_ops vdpa_nl_ops[] = { { .cmd = VDPA_CMD_MGMTDEV_GET, .doit = vdpa_nl_cmd_mgmtdev_get_doit, .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_NEW, .doit = vdpa_nl_cmd_dev_add_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_DEL, .doit = vdpa_nl_cmd_dev_del_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_GET, .doit = vdpa_nl_cmd_dev_get_doit, .dumpit = vdpa_nl_cmd_dev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_CONFIG_GET, .doit 
= vdpa_nl_cmd_dev_config_get_doit, .dumpit = vdpa_nl_cmd_dev_config_get_dumpit, }, { .cmd = VDPA_CMD_DEV_VSTATS_GET, .doit = vdpa_nl_cmd_dev_stats_get_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_ATTR_SET, .doit = vdpa_nl_cmd_dev_attr_set_doit, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family vdpa_nl_family __ro_after_init = { .name = VDPA_GENL_NAME, .version = VDPA_GENL_VERSION, .maxattr = VDPA_ATTR_MAX, .policy = vdpa_nl_policy, .netnsok = false, .module = THIS_MODULE, .ops = vdpa_nl_ops, .n_ops = ARRAY_SIZE(vdpa_nl_ops), .resv_start_op = VDPA_CMD_DEV_VSTATS_GET + 1, }; static int vdpa_init(void) { int err; err = bus_register(&vdpa_bus); if (err) return err; err = genl_register_family(&vdpa_nl_family); if (err) goto err; return 0; err: bus_unregister(&vdpa_bus); return err; } static void __exit vdpa_exit(void) { genl_unregister_family(&vdpa_nl_family); bus_unregister(&vdpa_bus); ida_destroy(&vdpa_index_ida); } core_initcall(vdpa_init); module_exit(vdpa_exit); MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>"); MODULE_DESCRIPTION("vDPA bus"); MODULE_LICENSE("GPL v2"); |
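/*
 * Illustrative sketch only (not part of vdpa.c): how a parent driver might
 * plug into the management API above. All "foo_*" names and the externally
 * defined foo_vdpa_config_ops are hypothetical; only the vdpa core calls
 * (vdpa_alloc_device(), _vdpa_register_device(), _vdpa_unregister_device()
 * and vdpa_mgmtdev_register()) come from the code above.
 */
#include <linux/vdpa.h>
#include <linux/virtio_ids.h>

struct foo_vdpa {
	struct vdpa_device vdpa;	/* must be first: vdpa_alloc_device() requires offset 0 */
	/* driver-private state would follow */
};

/* assumed to be provided elsewhere by the hypothetical driver */
extern const struct vdpa_config_ops foo_vdpa_config_ops;

static struct virtio_device_id foo_id_table[] = {
	{ .device = VIRTIO_ID_NET, .vendor = 0 /* hypothetical vendor */ },
	{ 0 },
};

static int foo_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name,
			    const struct vdpa_dev_set_config *config)
{
	struct foo_vdpa *foo;
	int err;

	/* one vq group, one address space, no userspace VA; name may be NULL */
	foo = vdpa_alloc_device(struct foo_vdpa, vdpa, mdev->device,
				&foo_vdpa_config_ops, 1, 1, name, false);
	if (IS_ERR(foo))
		return PTR_ERR(foo);

	/* _vdpa_register_device() rejects devices without a valid mdev */
	foo->vdpa.mdev = mdev;

	err = _vdpa_register_device(&foo->vdpa, 2 /* nvqs */);
	if (err) {
		/* drops the last reference; frees through vdpa_release_dev() */
		put_device(&foo->vdpa.dev);
		return err;
	}
	return 0;
}

static void foo_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev)
{
	_vdpa_unregister_device(dev);
}

static const struct vdpa_mgmtdev_ops foo_mgmtdev_ops = {
	.dev_add = foo_vdpa_dev_add,
	.dev_del = foo_vdpa_dev_del,
};

static struct vdpa_mgmt_dev foo_mgmt_dev = {
	.ops = &foo_mgmtdev_ops,
	.id_table = foo_id_table,
	/* .device must point at the parent struct device before registering */
};

/* typically called from the parent driver's probe(): vdpa_mgmtdev_register(&foo_mgmt_dev) */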
// SPDX-License-Identifier: GPL-2.0-only /* * mac80211 debugfs for wireless PHYs * * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2018 - 2019, 2021-2024 Intel Corporation */ #include <linux/debugfs.h> #include <linux/rtnetlink.h> #include <linux/vmalloc.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "debugfs.h" #define DEBUGFS_FORMAT_BUFFER_SIZE 100 int mac80211_format_buffer(char __user *userbuf, size_t count, loff_t *ppos, char *fmt, ...) { va_list args; char buf[DEBUGFS_FORMAT_BUFFER_SIZE]; int res; va_start(args, fmt); res = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); return simple_read_from_buffer(userbuf, count, ppos, buf, res); } #define DEBUGFS_READONLY_FILE_FN(name, fmt, value...)
\ static ssize_t name## _read(struct file *file, char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ struct ieee80211_local *local = file->private_data; \ \ return mac80211_format_buffer(userbuf, count, ppos, \ fmt "\n", ##value); \ } #define DEBUGFS_READONLY_FILE_OPS(name) \ static const struct debugfs_short_fops name## _ops = { \ .read = name## _read, \ .llseek = generic_file_llseek, \ }; #define DEBUGFS_READONLY_FILE(name, fmt, value...) \ DEBUGFS_READONLY_FILE_FN(name, fmt, value) \ DEBUGFS_READONLY_FILE_OPS(name) #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, phyd, local, &name## _ops) #define DEBUGFS_ADD_MODE(name, mode) \ debugfs_create_file(#name, mode, phyd, local, &name## _ops); DEBUGFS_READONLY_FILE(hw_conf, "%x", local->hw.conf.flags); DEBUGFS_READONLY_FILE(user_power, "%d", local->user_power_level); DEBUGFS_READONLY_FILE(power, "%d", local->hw.conf.power_level); DEBUGFS_READONLY_FILE(total_ps_buffered, "%d", local->total_ps_buffered); DEBUGFS_READONLY_FILE(wep_iv, "%#08x", local->wep_iv & 0xffffff); DEBUGFS_READONLY_FILE(rate_ctrl_alg, "%s", local->rate_ctrl ? local->rate_ctrl->ops->name : "hw/driver"); static ssize_t aqm_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; struct fq *fq = &local->fq; char buf[200]; int len = 0; spin_lock_bh(&local->fq.lock); rcu_read_lock(); len = scnprintf(buf, sizeof(buf), "access name value\n" "R fq_flows_cnt %u\n" "R fq_backlog %u\n" "R fq_overlimit %u\n" "R fq_overmemory %u\n" "R fq_collisions %u\n" "R fq_memory_usage %u\n" "RW fq_memory_limit %u\n" "RW fq_limit %u\n" "RW fq_quantum %u\n", fq->flows_cnt, fq->backlog, fq->overmemory, fq->overlimit, fq->collisions, fq->memory_usage, fq->memory_limit, fq->limit, fq->quantum); rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t aqm_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[100]; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) return count; else if (sscanf(buf, "fq_memory_limit %u", &local->fq.memory_limit) == 1) return count; else if (sscanf(buf, "fq_quantum %u", &local->fq.quantum) == 1) return count; return -EINVAL; } static const struct debugfs_short_fops aqm_ops = { .write = aqm_write, .read = aqm_read, .llseek = default_llseek, }; static ssize_t airtime_flags_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[128] = {}, *pos, *end; pos = buf; end = pos + sizeof(buf) - 1; if (local->airtime_flags & AIRTIME_USE_TX) pos += scnprintf(pos, end - pos, "AIRTIME_TX\t(%lx)\n", AIRTIME_USE_TX); if (local->airtime_flags & AIRTIME_USE_RX) pos += scnprintf(pos, end - pos, "AIRTIME_RX\t(%lx)\n", AIRTIME_USE_RX); return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); } static ssize_t airtime_flags_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[16]; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if 
(kstrtou16(buf, 0, &local->airtime_flags)) return -EINVAL; return count; } static const struct debugfs_short_fops airtime_flags_ops = { .write = airtime_flags_write, .read = airtime_flags_read, .llseek = default_llseek, }; static ssize_t aql_pending_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[400]; int len = 0; len = scnprintf(buf, sizeof(buf), "AC AQL pending\n" "VO %u us\n" "VI %u us\n" "BE %u us\n" "BK %u us\n" "total %u us\n", atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VO]), atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VI]), atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BE]), atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BK]), atomic_read(&local->aql_total_pending_airtime)); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static const struct debugfs_short_fops aql_pending_ops = { .read = aql_pending_read, .llseek = default_llseek, }; static ssize_t aql_txq_limit_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[400]; int len = 0; len = scnprintf(buf, sizeof(buf), "AC AQL limit low AQL limit high\n" "VO %u %u\n" "VI %u %u\n" "BE %u %u\n" "BK %u %u\n", local->aql_txq_limit_low[IEEE80211_AC_VO], local->aql_txq_limit_high[IEEE80211_AC_VO], local->aql_txq_limit_low[IEEE80211_AC_VI], local->aql_txq_limit_high[IEEE80211_AC_VI], local->aql_txq_limit_low[IEEE80211_AC_BE], local->aql_txq_limit_high[IEEE80211_AC_BE], local->aql_txq_limit_low[IEEE80211_AC_BK], local->aql_txq_limit_high[IEEE80211_AC_BK]); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t aql_txq_limit_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[100]; u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; struct sta_info *sta; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) return -EINVAL; if (ac >= IEEE80211_NUM_ACS) return -EINVAL; q_limit_low_old = local->aql_txq_limit_low[ac]; q_limit_high_old = local->aql_txq_limit_high[ac]; guard(wiphy)(local->hw.wiphy); local->aql_txq_limit_low[ac] = q_limit_low; local->aql_txq_limit_high[ac] = q_limit_high; list_for_each_entry(sta, &local->sta_list, list) { /* If a sta has customized queue limits, keep it */ if (sta->airtime[ac].aql_limit_low == q_limit_low_old && sta->airtime[ac].aql_limit_high == q_limit_high_old) { sta->airtime[ac].aql_limit_low = q_limit_low; sta->airtime[ac].aql_limit_high = q_limit_high; } } return count; } static const struct debugfs_short_fops aql_txq_limit_ops = { .write = aql_txq_limit_write, .read = aql_txq_limit_read, .llseek = default_llseek, }; static ssize_t aql_enable_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { char buf[3]; int len; len = scnprintf(buf, sizeof(buf), "%d\n", !static_key_false(&aql_disable.key)); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t aql_enable_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { bool aql_disabled = static_key_false(&aql_disable.key); char buf[3]; size_t len; if (count > sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, 
count)) return -EFAULT; buf[sizeof(buf) - 1] = '\0'; len = strlen(buf); if (len > 0 && buf[len - 1] == '\n') buf[len - 1] = 0; if (buf[0] == '0' && buf[1] == '\0') { if (!aql_disabled) static_branch_inc(&aql_disable); } else if (buf[0] == '1' && buf[1] == '\0') { if (aql_disabled) static_branch_dec(&aql_disable); } else { return -EINVAL; } return count; } static const struct debugfs_short_fops aql_enable_ops = { .write = aql_enable_write, .read = aql_enable_read, .llseek = default_llseek, }; static ssize_t force_tx_status_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[3]; int len = 0; len = scnprintf(buf, sizeof(buf), "%d\n", (int)local->force_tx_status); return simple_read_from_buffer(user_buf, count, ppos, buf, len); } static ssize_t force_tx_status_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; char buf[3]; if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; if (count && buf[count - 1] == '\n') buf[count - 1] = '\0'; else buf[count] = '\0'; if (buf[0] == '0' && buf[1] == '\0') local->force_tx_status = 0; else if (buf[0] == '1' && buf[1] == '\0') local->force_tx_status = 1; else return -EINVAL; return count; } static const struct debugfs_short_fops force_tx_status_ops = { .write = force_tx_status_write, .read = force_tx_status_read, .llseek = default_llseek, }; #ifdef CONFIG_PM static ssize_t reset_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; int ret; rtnl_lock(); wiphy_lock(local->hw.wiphy); __ieee80211_suspend(&local->hw, NULL); ret = __ieee80211_resume(&local->hw); wiphy_unlock(local->hw.wiphy); if (ret) cfg80211_shutdown_all_interfaces(local->hw.wiphy); rtnl_unlock(); return count; } static const struct debugfs_short_fops reset_ops = { .write = reset_write, .llseek = noop_llseek, }; #endif static const char *hw_flag_names[] = { #define FLAG(F) [IEEE80211_HW_##F] = #F FLAG(HAS_RATE_CONTROL), FLAG(RX_INCLUDES_FCS), FLAG(HOST_BROADCAST_PS_BUFFERING), FLAG(SIGNAL_UNSPEC), FLAG(SIGNAL_DBM), FLAG(NEED_DTIM_BEFORE_ASSOC), FLAG(SPECTRUM_MGMT), FLAG(AMPDU_AGGREGATION), FLAG(SUPPORTS_PS), FLAG(PS_NULLFUNC_STACK), FLAG(SUPPORTS_DYNAMIC_PS), FLAG(MFP_CAPABLE), FLAG(WANT_MONITOR_VIF), FLAG(NO_VIRTUAL_MONITOR), FLAG(NO_AUTO_VIF), FLAG(SW_CRYPTO_CONTROL), FLAG(SUPPORT_FAST_XMIT), FLAG(REPORTS_TX_ACK_STATUS), FLAG(CONNECTION_MONITOR), FLAG(QUEUE_CONTROL), FLAG(SUPPORTS_PER_STA_GTK), FLAG(AP_LINK_PS), FLAG(TX_AMPDU_SETUP_IN_HW), FLAG(SUPPORTS_RC_TABLE), FLAG(P2P_DEV_ADDR_FOR_INTF), FLAG(TIMING_BEACON_ONLY), FLAG(SUPPORTS_HT_CCK_RATES), FLAG(CHANCTX_STA_CSA), FLAG(SUPPORTS_CLONED_SKBS), FLAG(SINGLE_SCAN_ON_ALL_BANDS), FLAG(TDLS_WIDER_BW), FLAG(SUPPORTS_AMSDU_IN_AMPDU), FLAG(BEACON_TX_STATUS), FLAG(NEEDS_UNIQUE_STA_ADDR), FLAG(SUPPORTS_REORDERING_BUFFER), FLAG(USES_RSS), FLAG(TX_AMSDU), FLAG(TX_FRAG_LIST), FLAG(REPORTS_LOW_ACK), FLAG(SUPPORTS_TX_FRAG), FLAG(SUPPORTS_TDLS_BUFFER_STA), FLAG(DOESNT_SUPPORT_QOS_NDP), FLAG(BUFF_MMPDU_TXQ), FLAG(SUPPORTS_VHT_EXT_NSS_BW), FLAG(STA_MMPDU_TXQ), FLAG(TX_STATUS_NO_AMPDU_LEN), FLAG(SUPPORTS_MULTI_BSSID), FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID), FLAG(AMPDU_KEYBORDER_SUPPORT), FLAG(SUPPORTS_TX_ENCAP_OFFLOAD), FLAG(SUPPORTS_RX_DECAP_OFFLOAD), FLAG(SUPPORTS_CONC_MON_RX_DECAP), FLAG(DETECTS_COLOR_COLLISION), FLAG(MLO_MCAST_MULTI_LINK_TX), 
FLAG(DISALLOW_PUNCTURING), FLAG(DISALLOW_PUNCTURING_5GHZ), FLAG(HANDLES_QUIET_CSA), #undef FLAG }; static ssize_t hwflags_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; size_t bufsz = 30 * NUM_IEEE80211_HW_FLAGS; char *buf = kzalloc(bufsz, GFP_KERNEL); char *pos = buf, *end = buf + bufsz - 1; ssize_t rv; int i; if (!buf) return -ENOMEM; /* fail compilation if somebody adds or removes * a flag without updating the name array above */ BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS); for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) { if (test_bit(i, local->hw.flags)) pos += scnprintf(pos, end - pos, "%s\n", hw_flag_names[i]); } rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); return rv; } static ssize_t misc_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; /* Max len of each line is 16 characters, plus 9 for 'pending:\n' */ size_t bufsz = IEEE80211_MAX_QUEUES * 16 + 9; char *buf; char *pos, *end; ssize_t rv; int i; int ln; buf = kzalloc(bufsz, GFP_KERNEL); if (!buf) return -ENOMEM; pos = buf; end = buf + bufsz - 1; pos += scnprintf(pos, end - pos, "pending:\n"); for (i = 0; i < IEEE80211_MAX_QUEUES; i++) { ln = skb_queue_len(&local->pending[i]); pos += scnprintf(pos, end - pos, "[%i] %d\n", i, ln); } rv = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); kfree(buf); return rv; } static ssize_t queues_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct ieee80211_local *local = file->private_data; unsigned long flags; char buf[IEEE80211_MAX_QUEUES * 20]; int q, res = 0; spin_lock_irqsave(&local->queue_stop_reason_lock, flags); for (q = 0; q < local->hw.queues; q++) res += sprintf(buf + res, "%02d: %#.8lx/%d\n", q, local->queue_stop_reasons[q], skb_queue_len(&local->pending[q])); spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); return simple_read_from_buffer(user_buf, count, ppos, buf, res); } DEBUGFS_READONLY_FILE_OPS(hwflags); DEBUGFS_READONLY_FILE_OPS(queues); DEBUGFS_READONLY_FILE_OPS(misc); /* statistics stuff */ static ssize_t format_devstat_counter(struct ieee80211_local *local, char __user *userbuf, size_t count, loff_t *ppos, int (*printvalue)(struct ieee80211_low_level_stats *stats, char *buf, int buflen)) { struct ieee80211_low_level_stats stats; char buf[20]; int res; wiphy_lock(local->hw.wiphy); res = drv_get_stats(local, &stats); wiphy_unlock(local->hw.wiphy); if (res) return res; res = printvalue(&stats, buf, sizeof(buf)); return simple_read_from_buffer(userbuf, count, ppos, buf, res); } #define DEBUGFS_DEVSTATS_FILE(name) \ static int print_devstats_##name(struct ieee80211_low_level_stats *stats,\ char *buf, int buflen) \ { \ return scnprintf(buf, buflen, "%u\n", stats->name); \ } \ static ssize_t stats_ ##name## _read(struct file *file, \ char __user *userbuf, \ size_t count, loff_t *ppos) \ { \ return format_devstat_counter(file->private_data, \ userbuf, \ count, \ ppos, \ print_devstats_##name); \ } \ \ static const struct debugfs_short_fops stats_ ##name## _ops = { \ .read = stats_ ##name## _read, \ .llseek = generic_file_llseek, \ }; #ifdef CONFIG_MAC80211_DEBUG_COUNTERS #define DEBUGFS_STATS_ADD(name) \ debugfs_create_u32(#name, 0400, statsd, &local->name); #endif #define DEBUGFS_DEVSTATS_ADD(name) \ debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops); 
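/*
 * For each low-level counter below, DEBUGFS_DEVSTATS_FILE() expands to a
 * print_devstats_<name>() formatter, a stats_<name>_read() handler that
 * pulls the counters via format_devstat_counter() and drv_get_stats(), and
 * a read-only stats_<name>_ops descriptor; DEBUGFS_DEVSTATS_ADD() then
 * exposes it as a 0400 file under the per-PHY "statistics" directory in
 * debugfs_hw_add().
 */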
DEBUGFS_DEVSTATS_FILE(dot11ACKFailureCount); DEBUGFS_DEVSTATS_FILE(dot11RTSFailureCount); DEBUGFS_DEVSTATS_FILE(dot11FCSErrorCount); DEBUGFS_DEVSTATS_FILE(dot11RTSSuccessCount); void debugfs_hw_add(struct ieee80211_local *local) { struct dentry *phyd = local->hw.wiphy->debugfsdir; struct dentry *statsd; if (!phyd) return; local->debugfs.keys = debugfs_create_dir("keys", phyd); DEBUGFS_ADD(total_ps_buffered); DEBUGFS_ADD(wep_iv); DEBUGFS_ADD(rate_ctrl_alg); DEBUGFS_ADD(queues); DEBUGFS_ADD(misc); #ifdef CONFIG_PM DEBUGFS_ADD_MODE(reset, 0200); #endif DEBUGFS_ADD(hwflags); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); DEBUGFS_ADD(hw_conf); DEBUGFS_ADD_MODE(force_tx_status, 0600); DEBUGFS_ADD_MODE(aql_enable, 0600); DEBUGFS_ADD(aql_pending); DEBUGFS_ADD_MODE(aqm, 0600); DEBUGFS_ADD_MODE(airtime_flags, 0600); DEBUGFS_ADD(aql_txq_limit); debugfs_create_u32("aql_threshold", 0600, phyd, &local->aql_threshold); statsd = debugfs_create_dir("statistics", phyd); #ifdef CONFIG_MAC80211_DEBUG_COUNTERS DEBUGFS_STATS_ADD(dot11TransmittedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastTransmittedFrameCount); DEBUGFS_STATS_ADD(dot11FailedCount); DEBUGFS_STATS_ADD(dot11RetryCount); DEBUGFS_STATS_ADD(dot11MultipleRetryCount); DEBUGFS_STATS_ADD(dot11FrameDuplicateCount); DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); DEBUGFS_STATS_ADD(tx_handlers_drop); DEBUGFS_STATS_ADD(tx_handlers_queued); DEBUGFS_STATS_ADD(tx_handlers_drop_wep); DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc); DEBUGFS_STATS_ADD(tx_handlers_drop_unauth_port); DEBUGFS_STATS_ADD(rx_handlers_drop); DEBUGFS_STATS_ADD(rx_handlers_queued); DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc); DEBUGFS_STATS_ADD(rx_handlers_drop_defrag); DEBUGFS_STATS_ADD(tx_expand_skb_head); DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned); DEBUGFS_STATS_ADD(rx_expand_skb_head_defrag); DEBUGFS_STATS_ADD(rx_handlers_fragments); DEBUGFS_STATS_ADD(tx_status_drop); #endif DEBUGFS_DEVSTATS_ADD(dot11ACKFailureCount); DEBUGFS_DEVSTATS_ADD(dot11RTSFailureCount); DEBUGFS_DEVSTATS_ADD(dot11FCSErrorCount); DEBUGFS_DEVSTATS_ADD(dot11RTSSuccessCount); } |
// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
#include <linux/rwsem.h>
#include <linux/shrinker.h>
#include <linux/rculist.h>
#include <trace/events/vmscan.h>

#include "internal.h"
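/*
 * Every registered shrinker is linked into shrinker_list, which the global
 * reclaim path in shrink_slab() walks under RCU; shrinker_mutex serializes
 * (un)registration and, with CONFIG_MEMCG, growth of the per-memcg
 * shrinker_info maps managed below.
 */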
LIST_HEAD(shrinker_list); DEFINE_MUTEX(shrinker_mutex); #ifdef CONFIG_MEMCG static int shrinker_nr_max; static inline int shrinker_unit_size(int nr_items) { return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *)); } static inline void shrinker_unit_free(struct shrinker_info *info, int start) { struct shrinker_info_unit **unit; int nr, i; if (!info) return; unit = info->unit; nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS); for (i = start; i < nr; i++) { if (!unit[i]) break; kfree(unit[i]); unit[i] = NULL; } } static inline int shrinker_unit_alloc(struct shrinker_info *new, struct shrinker_info *old, int nid) { struct shrinker_info_unit *unit; int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS); int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0; int i; for (i = start; i < nr; i++) { unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid); if (!unit) { shrinker_unit_free(new, start); return -ENOMEM; } new->unit[i] = unit; } return 0; } void free_shrinker_info(struct mem_cgroup *memcg) { struct mem_cgroup_per_node *pn; struct shrinker_info *info; int nid; for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); shrinker_unit_free(info, 0); kvfree(info); rcu_assign_pointer(pn->shrinker_info, NULL); } } int alloc_shrinker_info(struct mem_cgroup *memcg) { int nid, ret = 0; int array_size = 0; mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { struct shrinker_info *info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); if (!info) goto err; info->map_nr_max = shrinker_nr_max; if (shrinker_unit_alloc(info, NULL, nid)) { kvfree(info); goto err; } rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } mutex_unlock(&shrinker_mutex); return ret; err: mutex_unlock(&shrinker_mutex); free_shrinker_info(memcg); return -ENOMEM; } static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, lockdep_is_held(&shrinker_mutex)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, int old_size, int new_nr_max) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; for_each_node(nid) { pn = memcg->nodeinfo[nid]; old = shrinker_info_protected(memcg, nid); /* Not yet online memcg */ if (!old) return 0; /* Already expanded this shrinker_info */ if (new_nr_max <= old->map_nr_max) continue; new = kvzalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid); if (!new) return -ENOMEM; new->map_nr_max = new_nr_max; memcpy(new->unit, old->unit, old_size); if (shrinker_unit_alloc(new, old, nid)) { kvfree(new); return -ENOMEM; } rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); } return 0; } static int expand_shrinker_info(int new_id) { int ret = 0; int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS); int new_size, old_size = 0; struct mem_cgroup *memcg; if (!root_mem_cgroup) goto out; lockdep_assert_held(&shrinker_mutex); new_size = shrinker_unit_size(new_nr_max); old_size = shrinker_unit_size(shrinker_nr_max); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { ret = expand_one_shrinker_info(memcg, new_size, old_size, new_nr_max); if (ret) { mem_cgroup_iter_break(NULL, memcg); goto out; } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); out: if (!ret) shrinker_nr_max = new_nr_max; return ret; } static inline int shrinker_id_to_index(int shrinker_id) { return 
shrinker_id / SHRINKER_UNIT_BITS; } static inline int shrinker_id_to_offset(int shrinker_id) { return shrinker_id % SHRINKER_UNIT_BITS; } static inline int calc_shrinker_id(int index, int offset) { return index * SHRINKER_UNIT_BITS + offset; } void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; struct shrinker_info_unit *unit; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker_id)]; if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id_to_offset(shrinker_id), unit->map); } rcu_read_unlock(); } } static DEFINE_IDR(shrinker_idr); static int shrinker_memcg_alloc(struct shrinker *shrinker) { int id, ret = -ENOMEM; if (mem_cgroup_disabled()) return -ENOSYS; mutex_lock(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; if (id >= shrinker_nr_max) { if (expand_shrinker_info(id)) { idr_remove(&shrinker_idr, id); goto unlock; } } shrinker->id = id; ret = 0; unlock: mutex_unlock(&shrinker_mutex); return ret; } static void shrinker_memcg_remove(struct shrinker *shrinker) { int id = shrinker->id; BUG_ON(id < 0); lockdep_assert_held(&shrinker_mutex); idr_remove(&shrinker_idr, id); } static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; struct shrinker_info_unit *unit; long nr_deferred; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); rcu_read_unlock(); return nr_deferred; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; struct shrinker_info_unit *unit; long nr_deferred; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; nr_deferred = atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); rcu_read_unlock(); return nr_deferred; } void reparent_shrinker_deferred(struct mem_cgroup *memcg) { int nid, index, offset; long nr; struct mem_cgroup *parent; struct shrinker_info *child_info, *parent_info; struct shrinker_info_unit *child_unit, *parent_unit; parent = parent_mem_cgroup(memcg); if (!parent) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ mutex_lock(&shrinker_mutex); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) { child_unit = child_info->unit[index]; parent_unit = parent_info->unit[index]; for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) { nr = atomic_long_read(&child_unit->nr_deferred[offset]); atomic_long_add(nr, &parent_unit->nr_deferred[offset]); } } } mutex_unlock(&shrinker_mutex); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) { return -ENOSYS; } static void shrinker_memcg_remove(struct shrinker *shrinker) { } static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { return 0; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { 
return 0; } #endif /* CONFIG_MEMCG */ static long xchg_nr_deferred(struct shrinker *shrinker, struct shrink_control *sc) { int nid = sc->nid; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) nid = 0; if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE)) return xchg_nr_deferred_memcg(nid, shrinker, sc->memcg); return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); } static long add_nr_deferred(long nr, struct shrinker *shrinker, struct shrink_control *sc) { int nid = sc->nid; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) nid = 0; if (sc->memcg && (shrinker->flags & SHRINKER_MEMCG_AWARE)) return add_nr_deferred_memcg(nr, nid, shrinker, sc->memcg); return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); } #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, struct shrinker *shrinker, int priority) { unsigned long freed = 0; unsigned long long delta; long total_scan; long freeable; long nr; long new_nr; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0 || freeable == SHRINK_EMPTY) return freeable; /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ nr = xchg_nr_deferred(shrinker, shrinkctl); if (shrinker->seeks) { delta = freeable >> priority; delta *= 4; do_div(delta, shrinker->seeks); } else { /* * These objects don't require any IO to create. Trim * them aggressively under memory pressure to keep * them from causing refetches in the IO caches. */ delta = freeable / 2; } total_scan = nr >> priority; total_scan += delta; total_scan = min(total_scan, (2 * freeable)); trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, freeable, delta, total_scan, priority); /* * Normally, we should not scan less than batch_size objects in one * pass to avoid too frequent shrinker calls, but if the slab has less * than batch_size objects in total and we are really tight on memory, * we will try to reclaim all available objects, otherwise we can end * up failing allocations although there are plenty of reclaimable * objects spread over several slabs with usage less than the * batch_size. * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; shrinkctl->nr_scanned = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); total_scan -= shrinkctl->nr_scanned; scanned += shrinkctl->nr_scanned; cond_resched(); } /* * The deferred work is increased by any new work (delta) that wasn't * done, decreased by old deferred work that was done now. * * And it is capped to two times of the freeable items. */ next_deferred = max_t(long, (nr + delta - scanned), 0); next_deferred = min(next_deferred, (2 * freeable)); /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. 
*/ new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); return freed; } #ifdef CONFIG_MEMCG static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { struct shrinker_info *info; unsigned long ret, freed = 0; int offset, index = 0; if (!mem_cgroup_online(memcg)) return 0; /* * lockless algorithm of memcg shrink. * * The shrinker_info may be freed asynchronously via RCU in the * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used * to ensure the existence of the shrinker_info. * * The shrinker_info_unit is never freed unless its corresponding memcg * is destroyed. Here we already hold the refcount of memcg, so the * memcg will not be destroyed, and of course shrinker_info_unit will * not be freed. * * So in the memcg shrink: * step 1: use rcu_read_lock() to guarantee existence of the * shrinker_info. * step 2: after getting shrinker_info_unit we can safely release the * RCU lock. * step 3: traverse the bitmap and calculate shrinker_id * step 4: use rcu_read_lock() to guarantee existence of the shrinker. * step 5: use shrinker_id to find the shrinker, then use * shrinker_try_get() to guarantee existence of the shrinker, * then we can release the RCU lock to do do_shrink_slab() that * may sleep. * step 6: do shrinker_put() paired with step 5 to put the refcount, * if the refcount reaches 0, then wake up the waiter in * shrinker_free() by calling complete(). * Note: here is different from the global shrink, we don't * need to acquire the RCU lock to guarantee existence of * the shrinker, because we don't need to use this * shrinker to traverse the next shrinker in the bitmap. * step 7: we have already exited the read-side of rcu critical section * before calling do_shrink_slab(), the shrinker_info may be * released in expand_one_shrinker_info(), so go back to step 1 * to reacquire the shrinker_info. */ again: rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (unlikely(!info)) goto unlock; if (index < shrinker_id_to_index(info->map_nr_max)) { struct shrinker_info_unit *unit; unit = info->unit[index]; rcu_read_unlock(); for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; struct shrinker *shrinker; int shrinker_id = calc_shrinker_id(index, offset); rcu_read_lock(); shrinker = idr_find(&shrinker_idr, shrinker_id); if (unlikely(!shrinker || !shrinker_try_get(shrinker))) { clear_bit(offset, unit->map); rcu_read_unlock(); continue; } rcu_read_unlock(); /* Call non-slab shrinkers even though kmem is disabled */ if (!memcg_kmem_online() && !(shrinker->flags & SHRINKER_NONSLAB)) continue; ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) { clear_bit(offset, unit->map); /* * After the shrinker reported that it had no objects to * free, but before we cleared the corresponding bit in * the memcg shrinker map, a new object might have been * added. To make sure, we have the bit set in this * case, we invoke the shrinker one more time and reset * the bit if it reports that it is not empty anymore. 
* The memory barrier here pairs with the barrier in * set_shrinker_bit(): * * list_lru_add() shrink_slab_memcg() * list_add_tail() clear_bit() * <MB> <MB> * set_bit() do_shrink_slab() */ smp_mb__after_atomic(); ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; else set_shrinker_bit(memcg, nid, shrinker_id); } freed += ret; shrinker_put(shrinker); } index++; goto again; } unlock: rcu_read_unlock(); return freed; } #else /* !CONFIG_MEMCG */ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { return 0; } #endif /* CONFIG_MEMCG */ /** * shrink_slab - shrink slab caches * @gfp_mask: allocation context * @nid: node whose slab caches to target * @memcg: memory cgroup whose slab caches to target * @priority: the reclaim priority * * Call the shrink functions to age shrinkable caches. * * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, * unaware shrinkers will receive a node id of 0 instead. * * @memcg specifies the memory cgroup to target. Unaware shrinkers * are called only if it is the root cgroup. * * @priority is sc->priority, we take the number of objects and >> by priority * in order to get the scan target. * * Returns the number of reclaimed slab objects. */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { unsigned long ret, freed = 0; struct shrinker *shrinker; /* * The root memcg might be allocated even though memcg is disabled * via "cgroup_disable=memory" boot parameter. This could make * mem_cgroup_is_root() return false, then just run memcg slab * shrink, but skip global shrink. This may result in premature * oom. */ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); /* * lockless algorithm of global shrink. * * In the unregistration setp, the shrinker will be freed asynchronously * via RCU after its refcount reaches 0. So both rcu_read_lock() and * shrinker_try_get() can be used to ensure the existence of the shrinker. * * So in the global shrink: * step 1: use rcu_read_lock() to guarantee existence of the shrinker * and the validity of the shrinker_list walk. * step 2: use shrinker_try_get() to try get the refcount, if successful, * then the existence of the shrinker can also be guaranteed, * so we can release the RCU lock to do do_shrink_slab() that * may sleep. * step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(), * which ensures that neither this shrinker nor the next shrinker * will be freed in the next traversal operation. * step 4: do shrinker_put() paired with step 2 to put the refcount, * if the refcount reaches 0, then wake up the waiter in * shrinker_free() by calling complete(). */ rcu_read_lock(); list_for_each_entry_rcu(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; if (!shrinker_try_get(shrinker)) continue; rcu_read_unlock(); ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; freed += ret; rcu_read_lock(); shrinker_put(shrinker); } rcu_read_unlock(); cond_resched(); return freed; } struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) 
{ struct shrinker *shrinker; unsigned int size; va_list ap; int err; shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL); if (!shrinker) return NULL; va_start(ap, fmt); err = shrinker_debugfs_name_alloc(shrinker, fmt, ap); va_end(ap); if (err) goto err_name; shrinker->flags = flags | SHRINKER_ALLOCATED; shrinker->seeks = DEFAULT_SEEKS; if (flags & SHRINKER_MEMCG_AWARE) { err = shrinker_memcg_alloc(shrinker); if (err == -ENOSYS) { /* Memcg is not supported, fallback to non-memcg-aware shrinker. */ shrinker->flags &= ~SHRINKER_MEMCG_AWARE; goto non_memcg; } if (err) goto err_flags; return shrinker; } non_memcg: /* * The nr_deferred is available on per memcg level for memcg aware * shrinkers, so only allocate nr_deferred in the following cases: * - non-memcg-aware shrinkers * - !CONFIG_MEMCG * - memcg is disabled by kernel command line */ size = sizeof(*shrinker->nr_deferred); if (flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); if (!shrinker->nr_deferred) goto err_flags; return shrinker; err_flags: shrinker_debugfs_name_free(shrinker); err_name: kfree(shrinker); return NULL; } EXPORT_SYMBOL_GPL(shrinker_alloc); void shrinker_register(struct shrinker *shrinker) { if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) { pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker"); return; } mutex_lock(&shrinker_mutex); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); mutex_unlock(&shrinker_mutex); init_completion(&shrinker->done); /* * Now the shrinker is fully set up, take the first reference to it to * indicate that lookup operations are now allowed to use it via * shrinker_try_get(). */ refcount_set(&shrinker->refcount, 1); } EXPORT_SYMBOL_GPL(shrinker_register); static void shrinker_free_rcu_cb(struct rcu_head *head) { struct shrinker *shrinker = container_of(head, struct shrinker, rcu); kfree(shrinker->nr_deferred); kfree(shrinker); } void shrinker_free(struct shrinker *shrinker) { struct dentry *debugfs_entry = NULL; int debugfs_id; if (!shrinker) return; if (shrinker->flags & SHRINKER_REGISTERED) { /* drop the initial refcount */ shrinker_put(shrinker); /* * Wait for all lookups of the shrinker to complete, after that, * no shrinker is running or will run again, then we can safely * free it asynchronously via RCU and safely free the structure * where the shrinker is located, such as super_block etc. */ wait_for_completion(&shrinker->done); } mutex_lock(&shrinker_mutex); if (shrinker->flags & SHRINKER_REGISTERED) { /* * Now we can safely remove it from the shrinker_list and then * free it. */ list_del_rcu(&shrinker->list); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); shrinker->flags &= ~SHRINKER_REGISTERED; } shrinker_debugfs_name_free(shrinker); if (shrinker->flags & SHRINKER_MEMCG_AWARE) shrinker_memcg_remove(shrinker); mutex_unlock(&shrinker_mutex); if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); call_rcu(&shrinker->rcu, shrinker_free_rcu_cb); } EXPORT_SYMBOL_GPL(shrinker_free); |
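/*
 * Illustrative sketch of the lifecycle API above, kept under #if 0: a cache
 * owner allocates a shrinker, fills in its callbacks, registers it, and
 * later tears it down with shrinker_free(). The demo_* names and trivial
 * callbacks are hypothetical placeholders, not part of this file.
 */
#if 0
static unsigned long demo_count_objects(struct shrinker *shrink,
					struct shrink_control *sc)
{
	/* Report how many objects could be freed; 0 tells reclaim to skip us. */
	return 0;
}

static unsigned long demo_scan_objects(struct shrinker *shrink,
				       struct shrink_control *sc)
{
	/* A real implementation frees up to sc->nr_to_scan objects here. */
	return SHRINK_STOP;
}

static int demo_cache_init(void)
{
	struct shrinker *shrinker;

	shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "demo-cache");
	if (!shrinker)
		return -ENOMEM;

	shrinker->count_objects = demo_count_objects;
	shrinker->scan_objects = demo_scan_objects;
	shrinker_register(shrinker);
	return 0;
}

static void demo_cache_exit(struct shrinker *shrinker)
{
	/* Drops the initial reference, waits for lookups, frees via RCU. */
	shrinker_free(shrinker);
}
#endif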
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/buffer_head.h> #include <linux/delay.h> #include <linux/sort.h> #include <linux/hash.h> #include <linux/jhash.h> #include <linux/kallsyms.h> #include <linux/gfs2_ondisk.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/module.h> #include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/workqueue.h> #include <linux/jiffies.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> #include <linux/bit_spinlock.h> #include <linux/percpu.h> #include <linux/list_sort.h> #include <linux/lockref.h> #include <linux/rhashtable.h> #include <linux/pid_namespace.h> #include <linux/file.h> #include <linux/random.h> #include "gfs2.h" #include "incore.h" #include "glock.h" #include "glops.h" #include "inode.h" #include "lops.h" #include "meta_io.h" #include "quota.h" #include "super.h" #include "util.h" #include "bmap.h" #define CREATE_TRACE_POINTS #include "trace_gfs2.h" struct gfs2_glock_iter { struct gfs2_sbd *sdp; /* incore superblock */ struct rhashtable_iter hti; /* rhashtable iterator */ struct gfs2_glock *gl; /* current glock struct */ loff_t last_pos; /* last position */ }; typedef void (*glock_examiner) (struct gfs2_glock * gl); static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); static void request_demote(struct gfs2_glock *gl, unsigned int state, unsigned long delay, bool remote); static struct dentry *gfs2_root; static LIST_HEAD(lru_list); static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) static const struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), .head_offset = offsetof(struct gfs2_glock, gl_node), }; static struct rhashtable gl_hash_table; #define GLOCK_WAIT_TABLE_BITS 12 #define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS) static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned; struct wait_glock_queue { struct lm_lockname *name; wait_queue_entry_t wait; }; static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync, void *key) { struct wait_glock_queue *wait_glock = container_of(wait, struct wait_glock_queue, wait); struct lm_lockname *wait_name = wait_glock->name; struct lm_lockname *wake_name = key; if (wake_name->ln_sbd != wait_name->ln_sbd || wake_name->ln_number != wait_name->ln_number || wake_name->ln_type != wait_name->ln_type) return 0; return autoremove_wake_function(wait, mode, sync, key); } static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) { u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0); return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); } /** * wake_up_glock - Wake up waiters on a glock * @gl: the glock */ static void wake_up_glock(struct gfs2_glock *gl) { wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name); if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name); } static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); kfree(gl->gl_lksb.sb_lvbptr); if (gl->gl_ops->go_flags & GLOF_ASPACE) { struct gfs2_glock_aspace *gla = container_of(gl, struct gfs2_glock_aspace, 
glock); kmem_cache_free(gfs2_glock_aspace_cachep, gla); } else kmem_cache_free(gfs2_glock_cachep, gl); } /** * glock_blocked_by_withdraw - determine if we can still use a glock * @gl: the glock * * We need to allow some glocks to be enqueued, dequeued, promoted, and demoted * when we're withdrawn. For example, to maintain metadata integrity, we should * disallow the use of inode and rgrp glocks when withdrawn. Other glocks like * the iopen or freeze glock may be safely used because none of their * metadata goes through the journal. So in general, we should disallow all * glocks that are journaled, and allow all the others. One exception is: * we need to allow our active journal to be promoted and demoted so others * may recover it and we can reacquire it when they're done. */ static bool glock_blocked_by_withdraw(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!gfs2_withdrawing_or_withdrawn(sdp)) return false; if (gl->gl_ops->go_flags & GLOF_NONDISK) return false; if (!sdp->sd_jdesc || gl->gl_name.ln_number == sdp->sd_jdesc->jd_no_addr) return false; return true; } static void __gfs2_glock_free(struct gfs2_glock *gl) { rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); smp_mb(); wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); } void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; __gfs2_glock_free(gl); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_kill_wait); } void gfs2_glock_free_later(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; spin_lock(&lru_lock); list_add(&gl->gl_lru, &sdp->sd_dead_glocks); spin_unlock(&lru_lock); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_kill_wait); } static void gfs2_free_dead_glocks(struct gfs2_sbd *sdp) { struct list_head *list = &sdp->sd_dead_glocks; while(!list_empty(list)) { struct gfs2_glock *gl; gl = list_first_entry(list, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); __gfs2_glock_free(gl); } } /** * gfs2_glock_hold() - increment reference count on glock * @gl: The glock to hold * */ struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); return gl; } static void gfs2_glock_add_to_lru(struct gfs2_glock *gl) { spin_lock(&lru_lock); list_move_tail(&gl->gl_lru, &lru_list); if (!test_bit(GLF_LRU, &gl->gl_flags)) { set_bit(GLF_LRU, &gl->gl_flags); atomic_inc(&lru_count); } spin_unlock(&lru_lock); } static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { spin_lock(&lru_lock); if (test_bit(GLF_LRU, &gl->gl_flags)) { list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); } spin_unlock(&lru_lock); } /* * Enqueue the glock on the work queue. Passes one glock reference on to the * work queue. */ static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (!queue_delayed_work(sdp->sd_glock_wq, &gl->gl_work, delay)) { /* * We are holding the lockref spinlock, and the work was still * queued above. The queued work (glock_work_func) takes that * spinlock before dropping its glock reference(s), so it * cannot have dropped them in the meantime. 
*/ GLOCK_BUG_ON(gl, gl->gl_lockref.count < 2); gl->gl_lockref.count--; } } static void __gfs2_glock_put(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct address_space *mapping = gfs2_glock2aspace(gl); lockref_mark_dead(&gl->gl_lockref); spin_unlock(&gl->gl_lockref.lock); gfs2_glock_remove_from_lru(gl); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); if (mapping) { truncate_inode_pages_final(mapping); if (!gfs2_withdrawing_or_withdrawn(sdp)) GLOCK_BUG_ON(gl, !mapping_empty(mapping)); } trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } static bool __gfs2_glock_put_or_lock(struct gfs2_glock *gl) { if (lockref_put_or_lock(&gl->gl_lockref)) return true; GLOCK_BUG_ON(gl, gl->gl_lockref.count != 1); if (gl->gl_state != LM_ST_UNLOCKED) { gl->gl_lockref.count--; gfs2_glock_add_to_lru(gl); spin_unlock(&gl->gl_lockref.lock); return true; } return false; } /** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * */ void gfs2_glock_put(struct gfs2_glock *gl) { if (__gfs2_glock_put_or_lock(gl)) return; __gfs2_glock_put(gl); } /* * gfs2_glock_put_async - Decrement reference count without sleeping * @gl: The glock to put * * Decrement the reference count on glock immediately unless it is the last * reference. Defer putting the last reference to work queue context. */ void gfs2_glock_put_async(struct gfs2_glock *gl) { if (__gfs2_glock_put_or_lock(gl)) return; gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); } /** * may_grant - check if it's ok to grant a new lock * @gl: The glock * @current_gh: One of the current holders of @gl * @gh: The lock request which we wish to grant * * With our current compatibility rules, if a glock has one or more active * holders (HIF_HOLDER flag set), any of those holders can be passed in as * @current_gh; they are all the same as far as compatibility with the new @gh * goes. * * Returns true if it's ok to grant the lock. */ static inline bool may_grant(struct gfs2_glock *gl, struct gfs2_holder *current_gh, struct gfs2_holder *gh) { if (current_gh) { GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags)); switch(current_gh->gh_state) { case LM_ST_EXCLUSIVE: /* * Here we make a special exception to grant holders * who agree to share the EX lock with other holders * who also have the bit set. If the original holder * has the LM_FLAG_NODE_SCOPE bit set, we grant more * holders with the bit set.
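* In other words, EX can only be shared locally when both the existing holder and the new request carry LM_FLAG_NODE_SCOPE and the new request also asks for EX.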
*/ return gh->gh_state == LM_ST_EXCLUSIVE && (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) && (gh->gh_flags & LM_FLAG_NODE_SCOPE); case LM_ST_SHARED: case LM_ST_DEFERRED: return gh->gh_state == current_gh->gh_state; default: return false; } } if (gl->gl_state == gh->gh_state) return true; if (gh->gh_flags & GL_EXACT) return false; if (gl->gl_state == LM_ST_EXCLUSIVE) { return gh->gh_state == LM_ST_SHARED || gh->gh_state == LM_ST_DEFERRED; } if (gh->gh_flags & LM_FLAG_ANY) return gl->gl_state != LM_ST_UNLOCKED; return false; } static void gfs2_holder_wake(struct gfs2_holder *gh) { clear_bit(HIF_WAIT, &gh->gh_iflags); smp_mb__after_atomic(); wake_up_bit(&gh->gh_iflags, HIF_WAIT); if (gh->gh_flags & GL_ASYNC) { struct gfs2_sbd *sdp = gh->gh_gl->gl_name.ln_sbd; wake_up(&sdp->sd_async_glock_wait); } } /** * do_error - Something unexpected has happened during a lock request * @gl: The glock * @ret: The status from the DLM */ static void do_error(struct gfs2_glock *gl, const int ret) { struct gfs2_holder *gh, *tmp; list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (ret & LM_OUT_ERROR) gh->gh_error = -EIO; else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) gh->gh_error = GLR_TRYFAILED; else continue; list_del_init(&gh->gh_list); trace_gfs2_glock_queue(gh, 0); gfs2_holder_wake(gh); } } /** * find_first_holder - find the first "holder" gh * @gl: the glock */ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) { struct gfs2_holder *gh; if (!list_empty(&gl->gl_holders)) { gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); if (test_bit(HIF_HOLDER, &gh->gh_iflags)) return gh; } return NULL; } /* * gfs2_instantiate - Call the glops instantiate function * @gh: The glock holder * * Returns: 0 if instantiate was successful, or error. */ int gfs2_instantiate(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; const struct gfs2_glock_operations *glops = gl->gl_ops; int ret; again: if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags)) goto done; /* * Since we unlock the lockref lock, we set a flag to indicate * instantiate is in progress. */ if (test_and_set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) { wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG, TASK_UNINTERRUPTIBLE); /* * Here we just waited for a different instantiate to finish. * But that may not have been successful, as when a process * locks an inode glock _before_ it has an actual inode to * instantiate into. So we check again. This process might * have an inode to instantiate, so might be successful. */ goto again; } ret = glops->go_instantiate(gl); if (!ret) clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); if (ret) return ret; done: if (glops->go_held) return glops->go_held(gh); return 0; } /** * do_promote - promote as many requests as possible on the current queue * @gl: The glock * * Returns true on success (i.e., progress was made or there are no waiters). */ static bool do_promote(struct gfs2_glock *gl) { struct gfs2_holder *gh, *current_gh; current_gh = find_first_holder(gl); list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (!may_grant(gl, current_gh, gh)) { /* * If we get here, it means we may not grant this * holder for some reason. If this holder is at the * head of the list, it means we have a blocked holder * at the head, so return false. 
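* Otherwise the blocked request sits behind already-granted holders: fail any queued "try" locks via do_error() and stop scanning.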
*/ if (list_is_first(&gh->gh_list, &gl->gl_holders)) return false; do_error(gl, 0); break; } set_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_promote(gh); gfs2_holder_wake(gh); if (!current_gh) current_gh = gh; } return true; } /** * find_first_waiter - find the first gh that's waiting for the glock * @gl: the glock */ static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) { struct gfs2_holder *gh; list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) return gh; } return NULL; } /** * find_last_waiter - find the last gh that's waiting for the glock * @gl: the glock * * This also is a fast way of finding out if there are any waiters. */ static inline struct gfs2_holder *find_last_waiter(const struct gfs2_glock *gl) { struct gfs2_holder *gh; if (list_empty(&gl->gl_holders)) return NULL; gh = list_last_entry(&gl->gl_holders, struct gfs2_holder, gh_list); return test_bit(HIF_HOLDER, &gh->gh_iflags) ? NULL : gh; } /** * state_change - record that the glock is now in a different state * @gl: the glock * @new_state: the new state */ static void state_change(struct gfs2_glock *gl, unsigned int new_state) { if (new_state != gl->gl_target) /* shorten our minimum hold time */ gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, GL_GLOCK_MIN_HOLD); gl->gl_state = new_state; gl->gl_tchange = jiffies; } static void gfs2_set_demote(int nr, struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; set_bit(nr, &gl->gl_flags); smp_mb(); wake_up(&sdp->sd_async_glock_wait); } static void gfs2_demote_wake(struct gfs2_glock *gl) { gl->gl_demote_state = LM_ST_EXCLUSIVE; clear_bit(GLF_DEMOTE, &gl->gl_flags); smp_mb__after_atomic(); wake_up_bit(&gl->gl_flags, GLF_DEMOTE); } /** * finish_xmote - The DLM has replied to one of our lock requests * @gl: The glock * @ret: The status from the DLM * */ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh; unsigned state = ret & LM_OUT_ST_MASK; trace_gfs2_glock_state_change(gl, state); state_change(gl, state); gh = find_first_waiter(gl); /* Demote to UN request arrived during demote to SH or DF */ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) gl->gl_target = LM_ST_UNLOCKED; /* Check for state != intended state */ if (unlikely(state != gl->gl_target)) { if (gh && (ret & LM_OUT_CANCELED)) gfs2_holder_wake(gh); if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { list_move_tail(&gh->gh_list, &gl->gl_holders); gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (do_promote(gl)) goto out; goto retry; } /* Some error or failed "try lock" - report it */ if ((ret & LM_OUT_ERROR) || (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { gl->gl_target = gl->gl_state; do_error(gl, ret); goto out; } } switch(state) { /* Unlocked due to conversion deadlock, try again */ case LM_ST_UNLOCKED: retry: do_xmote(gl, gh, gl->gl_target); break; /* Conversion fails, unlock and try again */ case LM_ST_SHARED: case LM_ST_DEFERRED: do_xmote(gl, gh, LM_ST_UNLOCKED); break; default: /* Everything else */ fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n", gl->gl_target, state); GLOCK_BUG_ON(gl, 1); } return; } /* Fast path - we got what we asked for */ if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) gfs2_demote_wake(gl); if (state != LM_ST_UNLOCKED) 
{ if (glops->go_xmote_bh) { int rv; spin_unlock(&gl->gl_lockref.lock); rv = glops->go_xmote_bh(gl); spin_lock(&gl->gl_lockref.lock); if (rv) { do_error(gl, rv); goto out; } } do_promote(gl); } out: clear_bit(GLF_LOCK, &gl->gl_flags); } static bool is_system_glock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); if (gl == m_ip->i_gl) return true; return false; } /** * do_xmote - Calls the DLM to change the state of a lock * @gl: The lock state * @gh: The holder (only for promotes) * @target: The target lock state * */ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); int ret; if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && gh && !(gh->gh_flags & LM_FLAG_NOEXP)) goto skip_inval; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP); GLOCK_BUG_ON(gl, gl->gl_state == target); GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && glops->go_inval) { /* * If another process is already doing the invalidate, let that * finish first. The glock state machine will get back to this * holder again later. */ if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) return; do_error(gl, 0); /* Fail queued try locks */ } gl->gl_req = target; set_bit(GLF_BLOCKING, &gl->gl_flags); if ((gl->gl_req == LM_ST_UNLOCKED) || (gl->gl_state == LM_ST_EXCLUSIVE) || (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) clear_bit(GLF_BLOCKING, &gl->gl_flags); if (!glops->go_inval && !glops->go_sync) goto skip_inval; spin_unlock(&gl->gl_lockref.lock); if (glops->go_sync) { ret = glops->go_sync(gl); /* If we had a problem syncing (due to io errors or whatever, * we should not invalidate the metadata or tell dlm to * release the glock to other nodes. */ if (ret) { if (cmpxchg(&sdp->sd_log_error, 0, ret)) { fs_err(sdp, "Error %d syncing glock \n", ret); gfs2_dump_glock(NULL, gl, true); } spin_lock(&gl->gl_lockref.lock); goto skip_inval; } } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { /* * The call to go_sync should have cleared out the ail list. * If there are still items, we have a problem. We ought to * withdraw, but we can't because the withdraw code also uses * glocks. Warn about the error, dump the glock, then fall * through and wait for logd to do the withdraw for us. */ if ((atomic_read(&gl->gl_ail_count) != 0) && (!cmpxchg(&sdp->sd_log_error, 0, -EIO))) { gfs2_glock_assert_warn(gl, !atomic_read(&gl->gl_ail_count)); gfs2_dump_glock(NULL, gl, true); } glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } spin_lock(&gl->gl_lockref.lock); skip_inval: gl->gl_lockref.count++; /* * Check for an error encountered since we called go_sync and go_inval. * If so, we can't withdraw from the glock code because the withdraw * code itself uses glocks (see function signal_our_withdraw) to * change the mount to read-only. Most importantly, we must not call * dlm to unlock the glock until the journal is in a known good state * (after journal replay) otherwise other nodes may use the object * (rgrp or dinode) and then later, journal replay will corrupt the * file system. 
The best we can do here is wait for the logd daemon * to see sd_log_error and withdraw, and in the meantime, requeue the * work for later. * * We make a special exception for some system glocks, such as the * system statfs inode glock, which needs to be granted before the * gfs2_quotad daemon can exit, and that exit needs to finish before * we can unmount the withdrawn file system. * * However, if we're just unlocking the lock (say, for unmount, when * gfs2_gl_hash_clear calls clear_glock) and recovery is complete * then it's okay to tell dlm to unlock it. */ if (unlikely(sdp->sd_log_error) && !gfs2_withdrawing_or_withdrawn(sdp)) gfs2_withdraw_delayed(sdp); if (glock_blocked_by_withdraw(gl) && (target != LM_ST_UNLOCKED || test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) { if (!is_system_glock(gl)) { request_demote(gl, LM_ST_UNLOCKED, 0, false); /* * Ordinarily, we would call dlm and its callback would call * finish_xmote, which would call state_change() to the new state. * Since we withdrew, we won't call dlm, so call state_change * manually, but to the UNLOCKED state we desire. */ state_change(gl, LM_ST_UNLOCKED); /* * We skip telling dlm to do the locking, so we won't get a * reply that would otherwise clear GLF_LOCK. So we clear it here. */ clear_bit(GLF_LOCK, &gl->gl_flags); clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); return; } else { clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } } if (ls->ls_ops->lm_lock) { spin_unlock(&gl->gl_lockref.lock); ret = ls->ls_ops->lm_lock(gl, target, lck_flags); spin_lock(&gl->gl_lockref.lock); if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && target == LM_ST_UNLOCKED && test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { /* * The lockspace has been released and the lock has * been unlocked implicitly. */ } else if (ret) { fs_err(sdp, "lm_lock ret %d\n", ret); target = gl->gl_state | LM_OUT_ERROR; } else { /* The operation will be completed asynchronously. */ return; } } /* Complete the operation now. 
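* Either there is no lm_lock operation or it did not complete the request asynchronously, so apply @target as if a reply had arrived and requeue the glock work.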
*/ finish_xmote(gl, target); gfs2_glock_queue_work(gl, 0); } /** * run_queue - do all outstanding tasks related to a glock * @gl: The glock in question * @nonblock: True if we must not block in run_queue * */ static void run_queue(struct gfs2_glock *gl, const int nonblock) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { struct gfs2_holder *gh = NULL; if (test_bit(GLF_LOCK, &gl->gl_flags)) return; set_bit(GLF_LOCK, &gl->gl_flags); GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && gl->gl_demote_state != gl->gl_state) { if (find_first_holder(gl)) goto out_unlock; if (nonblock) goto out_sched; set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); if (do_promote(gl)) goto out_unlock; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ } do_xmote(gl, gh, gl->gl_target); return; out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); smp_mb__after_atomic(); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); return; out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); smp_mb__after_atomic(); } /** * glock_set_object - set the gl_object field of a glock * @gl: the glock * @object: the object */ void glock_set_object(struct gfs2_glock *gl, void *object) { void *prev_object; spin_lock(&gl->gl_lockref.lock); prev_object = gl->gl_object; gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) { pr_warn("glock=%u/%llx\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); gfs2_dump_glock(NULL, gl, true); } } /** * glock_clear_object - clear the gl_object field of a glock * @gl: the glock * @object: object the glock currently points at */ void glock_clear_object(struct gfs2_glock *gl, void *object) { void *prev_object; spin_lock(&gl->gl_lockref.lock); prev_object = gl->gl_object; gl->gl_object = NULL; spin_unlock(&gl->gl_lockref.lock); if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) { pr_warn("glock=%u/%llx\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); gfs2_dump_glock(NULL, gl, true); } } void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) { struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; if (ri->ri_magic == 0) ri->ri_magic = cpu_to_be32(GFS2_MAGIC); if (ri->ri_magic == cpu_to_be32(GFS2_MAGIC)) ri->ri_generation_deleted = cpu_to_be64(generation); } bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) { struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; if (ri->ri_magic != cpu_to_be32(GFS2_MAGIC)) return false; return generation <= be64_to_cpu(ri->ri_generation_deleted); } static void gfs2_glock_poke(struct gfs2_glock *gl) { int flags = LM_FLAG_TRY_1CB | LM_FLAG_ANY | GL_SKIP; struct gfs2_holder gh; int error; __gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh, _RET_IP_); error = gfs2_glock_nq(&gh); if (!error) gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); } static void gfs2_try_evict(struct gfs2_glock *gl) { struct gfs2_inode *ip; /* * If there is contention on the iopen glock and we have an inode, try * to grab and release the inode so that it can be evicted. The * GIF_DEFER_DELETE flag indicates to gfs2_evict_inode() that the inode * should not be deleted locally. 
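* (All we want here is to get the inode evicted from our cache.)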
This will allow the remote node to * go ahead and delete the inode without us having to do it, which will * avoid rgrp glock thrashing. * * The remote node is likely still holding the corresponding inode * glock, so it will run before we get to verify that the delete has * happened below. (Verification is triggered by the call to * gfs2_queue_verify_delete() in gfs2_evict_inode().) */ spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; if (ip && !igrab(&ip->i_inode)) ip = NULL; spin_unlock(&gl->gl_lockref.lock); if (ip) { wait_on_inode(&ip->i_inode); if (is_bad_inode(&ip->i_inode)) { iput(&ip->i_inode); ip = NULL; } } if (ip) { set_bit(GIF_DEFER_DELETE, &ip->i_flags); d_prune_aliases(&ip->i_inode); iput(&ip->i_inode); /* If the inode was evicted, gl->gl_object will now be NULL. */ spin_lock(&gl->gl_lockref.lock); ip = gl->gl_object; if (ip) { clear_bit(GIF_DEFER_DELETE, &ip->i_flags); if (!igrab(&ip->i_inode)) ip = NULL; } spin_unlock(&gl->gl_lockref.lock); if (ip) { gfs2_glock_poke(ip->i_gl); iput(&ip->i_inode); } } } bool gfs2_queue_try_to_evict(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (test_and_set_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) return false; return !mod_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 0); } bool gfs2_queue_verify_delete(struct gfs2_glock *gl, bool later) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned long delay; if (test_and_set_bit(GLF_VERIFY_DELETE, &gl->gl_flags)) return false; delay = later ? HZ + get_random_long() % (HZ * 9) : 0; return queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, delay); } static void delete_work_func(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct gfs2_glock *gl = container_of(dwork, struct gfs2_glock, gl_delete); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; bool verify_delete = test_and_clear_bit(GLF_VERIFY_DELETE, &gl->gl_flags); if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) gfs2_try_evict(gl); if (verify_delete) { u64 no_addr = gl->gl_name.ln_number; struct inode *inode; inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, GFS2_BLKST_UNLINKED); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -EAGAIN && !test_bit(SDF_KILL, &sdp->sd_flags) && gfs2_queue_verify_delete(gl, true)) return; } else { d_prune_aliases(inode); iput(inode); } } gfs2_glock_put(gl); } static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); unsigned int drop_refs = 1; spin_lock(&gl->gl_lockref.lock); if (test_bit(GLF_HAVE_REPLY, &gl->gl_flags)) { clear_bit(GLF_HAVE_REPLY, &gl->gl_flags); finish_xmote(gl, gl->gl_reply); drop_refs++; } if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && gl->gl_state != LM_ST_UNLOCKED && gl->gl_demote_state != LM_ST_EXCLUSIVE) { if (gl->gl_name.ln_type == LM_TYPE_INODE) { unsigned long holdtime, now = jiffies; holdtime = gl->gl_tchange + gl->gl_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; } if (!delay) { clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); gfs2_set_demote(GLF_DEMOTE, gl); } } run_queue(gl, 0); if (delay) { /* Keep one glock reference for the work we requeue. */ drop_refs--; gfs2_glock_queue_work(gl, delay); } /* Drop the remaining glock references manually. 
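* If that drops the count to zero, free the glock immediately when it is already unlocked, and put it on the LRU otherwise.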
*/ GLOCK_BUG_ON(gl, gl->gl_lockref.count < drop_refs); gl->gl_lockref.count -= drop_refs; if (!gl->gl_lockref.count) { if (gl->gl_state == LM_ST_UNLOCKED) { __gfs2_glock_put(gl); return; } gfs2_glock_add_to_lru(gl); } spin_unlock(&gl->gl_lockref.lock); } static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, struct gfs2_glock *new) { struct wait_glock_queue wait; wait_queue_head_t *wq = glock_waitqueue(name); struct gfs2_glock *gl; wait.name = name; init_wait(&wait.wait); wait.wait.func = glock_wake_function; again: prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); rcu_read_lock(); if (new) { gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, &new->gl_node, ht_parms); if (IS_ERR(gl)) goto out; } else { gl = rhashtable_lookup_fast(&gl_hash_table, name, ht_parms); } if (gl && !lockref_get_not_dead(&gl->gl_lockref)) { rcu_read_unlock(); schedule(); goto again; } out: rcu_read_unlock(); finish_wait(wq, &wait.wait); if (gl) gfs2_glock_remove_from_lru(gl); return gl; } /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock * @number: the lock number * @glops: The glock_operations to use * @create: If 0, don't create the glock if it doesn't exist * @glp: the glock is returned here * * This does not lock a glock, just finds/creates structures for one. * * Returns: errno */ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp) { struct super_block *s = sdp->sd_vfs; struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type, .ln_sbd = sdp }; struct gfs2_glock *gl, *tmp; struct address_space *mapping; gl = find_insert_glock(&name, NULL); if (gl) goto found; if (!create) return -ENOENT; if (glops->go_flags & GLOF_ASPACE) { struct gfs2_glock_aspace *gla = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_NOFS); if (!gla) return -ENOMEM; gl = &gla->glock; } else { gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_NOFS); if (!gl) return -ENOMEM; } memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); gl->gl_ops = glops; if (glops->go_flags & GLOF_LVB) { gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); if (!gl->gl_lksb.sb_lvbptr) { gfs2_glock_dealloc(&gl->gl_rcu); return -ENOMEM; } } atomic_inc(&sdp->sd_glock_disposal); gl->gl_node.next = NULL; gl->gl_flags = BIT(GLF_INITIAL); if (glops->go_instantiate) gl->gl_flags |= BIT(GLF_INSTANTIATE_NEEDED); gl->gl_name = name; lockref_init(&gl->gl_lockref); lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); gl->gl_state = LM_ST_UNLOCKED; gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_dstamp = 0; preempt_disable(); /* We use the global stats to estimate the initial per-glock stats */ gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; preempt_enable(); gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; gl->gl_tchange = jiffies; gl->gl_object = NULL; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); if (gl->gl_name.ln_type == LM_TYPE_IOPEN) INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); mapping = gfs2_glock2aspace(gl); if (mapping) { mapping->a_ops = &gfs2_meta_aops; mapping->host = s->s_bdev->bd_mapping->host; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->i_private_data = NULL; mapping->writeback_index = 0; } tmp = find_insert_glock(&name, gl); if (tmp) { gfs2_glock_dealloc(&gl->gl_rcu); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) 
wake_up(&sdp->sd_kill_wait); if (IS_ERR(tmp)) return PTR_ERR(tmp); gl = tmp; } found: *glp = gl; return 0; } /** * __gfs2_holder_init - initialize a struct gfs2_holder in the default way * @gl: the glock * @state: the state we're requesting * @flags: the modifier flags * @gh: the holder structure * */ void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh, unsigned long ip) { INIT_LIST_HEAD(&gh->gh_list); gh->gh_gl = gfs2_glock_hold(gl); gh->gh_ip = ip; gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; } /** * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it * @state: the state we're requesting * @flags: the modifier flags * @gh: the holder structure * * Don't mess with the glock. * */ void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh) { gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; gh->gh_ip = _RET_IP_; put_pid(gh->gh_owner_pid); gh->gh_owner_pid = get_pid(task_pid(current)); } /** * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) * @gh: the holder structure * */ void gfs2_holder_uninit(struct gfs2_holder *gh) { put_pid(gh->gh_owner_pid); gfs2_glock_put(gh->gh_gl); gfs2_holder_mark_uninitialized(gh); gh->gh_ip = 0; } static void gfs2_glock_update_hold_time(struct gfs2_glock *gl, unsigned long start_time) { /* Have we waited longer that a second? */ if (time_after(jiffies, start_time + HZ)) { /* Lengthen the minimum hold time. */ gl->gl_hold_time = min(gl->gl_hold_time + GL_GLOCK_HOLD_INCR, GL_GLOCK_MAX_HOLD); } } /** * gfs2_glock_holder_ready - holder is ready and its error code can be collected * @gh: the glock holder * * Called when a glock holder no longer needs to be waited for because it is * now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has * failed (gh_error != 0). */ int gfs2_glock_holder_ready(struct gfs2_holder *gh) { if (gh->gh_error || (gh->gh_flags & GL_SKIP)) return gh->gh_error; gh->gh_error = gfs2_instantiate(gh); if (gh->gh_error) gfs2_glock_dq(gh); return gh->gh_error; } /** * gfs2_glock_wait - wait on a glock acquisition * @gh: the glock holder * * Returns: 0 on success */ int gfs2_glock_wait(struct gfs2_holder *gh) { unsigned long start_time = jiffies; might_sleep(); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); gfs2_glock_update_hold_time(gh->gh_gl, start_time); return gfs2_glock_holder_ready(gh); } static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs) { int i; for (i = 0; i < num_gh; i++) if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) return 1; return 0; } /** * gfs2_glock_async_wait - wait on multiple asynchronous glock acquisitions * @num_gh: the number of holders in the array * @ghs: the glock holder array * * Returns: 0 on success, meaning all glocks have been granted and are held. * -ESTALE if the request timed out, meaning all glocks were released, * and the caller should retry the operation. */ int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs) { struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd; int i, ret = 0, timeout = 0; unsigned long start_time = jiffies; might_sleep(); /* * Total up the (minimum hold time * 2) of all glocks and use that to * determine the max amount of time we should wait. 
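* (The timeout is in jiffies; each glock contributes gl_hold_time << 1.)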
*/ for (i = 0; i < num_gh; i++) timeout += ghs[i].gh_gl->gl_hold_time << 1; if (!wait_event_timeout(sdp->sd_async_glock_wait, !glocks_pending(num_gh, ghs), timeout)) { ret = -ESTALE; /* request timed out. */ goto out; } for (i = 0; i < num_gh; i++) { struct gfs2_holder *gh = &ghs[i]; int ret2; if (test_bit(HIF_HOLDER, &gh->gh_iflags)) { gfs2_glock_update_hold_time(gh->gh_gl, start_time); } ret2 = gfs2_glock_holder_ready(gh); if (!ret) ret = ret2; } out: if (ret) { for (i = 0; i < num_gh; i++) { struct gfs2_holder *gh = &ghs[i]; gfs2_glock_dq(gh); } } return ret; } /** * request_demote - process a demote request * @gl: the glock * @state: the state the caller wants us to change to * @delay: zero to demote immediately; otherwise pending demote * @remote: true if this came from a different cluster node * * There are only two requests that we are going to see in actual * practise: LM_ST_SHARED and LM_ST_UNLOCKED */ static void request_demote(struct gfs2_glock *gl, unsigned int state, unsigned long delay, bool remote) { gfs2_set_demote(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, gl); if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; } else if (gl->gl_demote_state != LM_ST_UNLOCKED && gl->gl_demote_state != state) { gl->gl_demote_state = LM_ST_UNLOCKED; } if (gl->gl_ops->go_callback) gl->gl_ops->go_callback(gl, remote); trace_gfs2_demote_rq(gl, remote); } void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); if (seq) { seq_vprintf(seq, fmt, args); } else { vaf.fmt = fmt; vaf.va = &args; pr_err("%pV", &vaf); } va_end(args); } static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) return true; if (gh->gh_state == LM_ST_UNLOCKED) return true; return false; } /** * add_to_queue - Add a holder to the wait queue (but look for recursion) * @gh: the holder structure to add * * Eventually we should move the recursive locking trap to a * debugging option or something like that. This is the fast * path and needs to have the minimum number of distractions. 
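* The caller must hold gl->gl_lockref.lock.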
* */ static inline void add_to_queue(struct gfs2_holder *gh) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct list_head *insert_pt = NULL; struct gfs2_holder *gh2; int try_futile = 0; GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL); if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) GLOCK_BUG_ON(gl, true); if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { if (test_bit(GLF_LOCK, &gl->gl_flags)) { struct gfs2_holder *current_gh; current_gh = find_first_holder(gl); try_futile = !may_grant(gl, current_gh, gh); } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) goto fail; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (likely(gh2->gh_owner_pid != gh->gh_owner_pid)) continue; if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK) continue; if (!pid_is_meaningful(gh2)) continue; goto trap_recursive; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (try_futile && !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { fail: gh->gh_error = GLR_TRYFAILED; gfs2_holder_wake(gh); return; } if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) continue; } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); if (likely(insert_pt == NULL)) { list_add_tail(&gh->gh_list, &gl->gl_holders); return; } list_add_tail(&gh->gh_list, insert_pt); spin_unlock(&gl->gl_lockref.lock); if (sdp->sd_lockstruct.ls_ops->lm_cancel) sdp->sd_lockstruct.ls_ops->lm_cancel(gl); spin_lock(&gl->gl_lockref.lock); return; trap_recursive: fs_err(sdp, "original: %pSR\n", (void *)gh2->gh_ip); fs_err(sdp, "pid: %d\n", pid_nr(gh2->gh_owner_pid)); fs_err(sdp, "lock type: %d req lock state : %d\n", gh2->gh_gl->gl_name.ln_type, gh2->gh_state); fs_err(sdp, "new: %pSR\n", (void *)gh->gh_ip); fs_err(sdp, "pid: %d\n", pid_nr(gh->gh_owner_pid)); fs_err(sdp, "lock type: %d req lock state : %d\n", gh->gh_gl->gl_name.ln_type, gh->gh_state); gfs2_dump_glock(NULL, gl, true); BUG(); } /** * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock) * @gh: the holder structure * * if (gh->gh_flags & GL_ASYNC), this never returns an error * * Returns: 0, GLR_TRYFAILED, or errno on failure */ int gfs2_glock_nq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; int error; if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP)) return -EIO; if (gh->gh_flags & GL_NOBLOCK) { struct gfs2_holder *current_gh; error = -ECHILD; spin_lock(&gl->gl_lockref.lock); if (find_last_waiter(gl)) goto unlock; current_gh = find_first_holder(gl); if (!may_grant(gl, current_gh, gh)) goto unlock; set_bit(HIF_HOLDER, &gh->gh_iflags); list_add_tail(&gh->gh_list, &gl->gl_holders); trace_gfs2_promote(gh); error = 0; unlock: spin_unlock(&gl->gl_lockref.lock); return error; } gh->gh_error = 0; spin_lock(&gl->gl_lockref.lock); add_to_queue(gh); if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && test_and_clear_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags))) { set_bit(GLF_HAVE_REPLY, &gl->gl_flags); gl->gl_lockref.count++; gfs2_glock_queue_work(gl, 0); } run_queue(gl, 1); spin_unlock(&gl->gl_lockref.lock); error = 0; if (!(gh->gh_flags & GL_ASYNC)) error = gfs2_glock_wait(gh); return error; } /** * gfs2_glock_poll - poll to see if an async request has been completed * @gh: the holder * * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on */ int gfs2_glock_poll(struct gfs2_holder *gh) { return test_bit(HIF_WAIT, &gh->gh_iflags) ? 
0 : 1; } static void __gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; unsigned delay = 0; int fast_path = 0; /* * This holder should not be cached, so mark it for demote. * Note: this should be done before the glock_needs_demote * check below. */ if (gh->gh_flags & GL_NOCACHE) request_demote(gl, LM_ST_UNLOCKED, 0, false); list_del_init(&gh->gh_list); clear_bit(HIF_HOLDER, &gh->gh_iflags); trace_gfs2_glock_queue(gh, 0); /* * If there hasn't been a demote request we are done. * (Let the remaining holders, if any, keep holding it.) */ if (!glock_needs_demote(gl)) { if (list_empty(&gl->gl_holders)) fast_path = 1; } if (unlikely(!fast_path)) { gl->gl_lockref.count++; if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && !test_bit(GLF_DEMOTE, &gl->gl_flags) && gl->gl_name.ln_type == LM_TYPE_INODE) delay = gl->gl_hold_time; gfs2_glock_queue_work(gl, delay); } } /** * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) * @gh: the glock holder * */ void gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; spin_lock(&gl->gl_lockref.lock); if (!gfs2_holder_queued(gh)) { /* * May have already been dequeued because the locking request * was GL_ASYNC and it has failed in the meantime. */ goto out; } if (list_is_first(&gh->gh_list, &gl->gl_holders) && !test_bit(HIF_HOLDER, &gh->gh_iflags)) { spin_unlock(&gl->gl_lockref.lock); gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); spin_lock(&gl->gl_lockref.lock); } /* * If we're in the process of file system withdraw, we cannot just * dequeue any glocks until our journal is recovered, lest we introduce * file system corruption. We need two exceptions to this rule: We need * to allow unlocking of nondisk glocks and the glock for our own * journal that needs recovery. 
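* (These are the same exceptions that glock_blocked_by_withdraw() encodes.)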
*/ if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && glock_blocked_by_withdraw(gl) && gh->gh_gl != sdp->sd_jinode_gl) { sdp->sd_glock_dqs_held++; spin_unlock(&gl->gl_lockref.lock); might_sleep(); wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, TASK_UNINTERRUPTIBLE); spin_lock(&gl->gl_lockref.lock); } __gfs2_glock_dq(gh); out: spin_unlock(&gl->gl_lockref.lock); } void gfs2_glock_dq_wait(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; gfs2_glock_dq(gh); might_sleep(); wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); } /** * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it * @gh: the holder structure * */ void gfs2_glock_dq_uninit(struct gfs2_holder *gh) { gfs2_glock_dq(gh); gfs2_holder_uninit(gh); } /** * gfs2_glock_nq_num - acquire a glock based on lock number * @sdp: the filesystem * @number: the lock number * @glops: the glock operations for the type of glock * @state: the state to acquire the glock in * @flags: modifier flags for the acquisition * @gh: the struct gfs2_holder * * Returns: errno */ int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, unsigned int state, u16 flags, struct gfs2_holder *gh) { struct gfs2_glock *gl; int error; error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); if (!error) { error = gfs2_glock_nq_init(gl, state, flags, gh); gfs2_glock_put(gl); } return error; } /** * glock_compare - Compare two struct gfs2_glock structures for sorting * @arg_a: the first structure * @arg_b: the second structure * */ static int glock_compare(const void *arg_a, const void *arg_b) { const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a; const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b; const struct lm_lockname *a = &gh_a->gh_gl->gl_name; const struct lm_lockname *b = &gh_b->gh_gl->gl_name; if (a->ln_number > b->ln_number) return 1; if (a->ln_number < b->ln_number) return -1; BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type); return 0; } /** * nq_m_sync - synchronously acquire more than one glock in deadlock free order * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * @p: placeholder for the holder structure to pass back * * Returns: 0 on success (all glocks acquired), * errno on failure (no glocks acquired) */ static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, struct gfs2_holder **p) { unsigned int x; int error = 0; for (x = 0; x < num_gh; x++) p[x] = &ghs[x]; sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL); for (x = 0; x < num_gh; x++) { error = gfs2_glock_nq(p[x]); if (error) { while (x--) gfs2_glock_dq(p[x]); break; } } return error; } /** * gfs2_glock_nq_m - acquire multiple glocks * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * * Returns: 0 on success (all glocks acquired), * errno on failure (no glocks acquired) */ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) { struct gfs2_holder *tmp[4]; struct gfs2_holder **pph = tmp; int error = 0; switch(num_gh) { case 0: return 0; case 1: return gfs2_glock_nq(ghs); default: if (num_gh <= 4) break; pph = kmalloc_array(num_gh, sizeof(struct gfs2_holder *), GFP_NOFS); if (!pph) return -ENOMEM; } error = nq_m_sync(num_gh, ghs, pph); if (pph != tmp) kfree(pph); return error; } /** * gfs2_glock_dq_m - release multiple glocks * @num_gh: the number of structures * @ghs: an array of struct gfs2_holder structures * */ void gfs2_glock_dq_m(unsigned int 
num_gh, struct gfs2_holder *ghs) { while (num_gh--) gfs2_glock_dq(&ghs[num_gh]); } void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { unsigned long delay = 0; gfs2_glock_hold(gl); spin_lock(&gl->gl_lockref.lock); if (!list_empty(&gl->gl_holders) && gl->gl_name.ln_type == LM_TYPE_INODE) { unsigned long now = jiffies; unsigned long holdtime; holdtime = gl->gl_tchange + gl->gl_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; if (test_bit(GLF_HAVE_REPLY, &gl->gl_flags)) delay = gl->gl_hold_time; } request_demote(gl, state, delay, true); gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); } /** * gfs2_should_freeze - Figure out if glock should be frozen * @gl: The glock in question * * Glocks are not frozen if (a) the result of the dlm operation is * an error, (b) the locking operation was an unlock operation or * (c) if there is a "noexp" flagged request anywhere in the queue * * Returns: 1 if freezing should occur, 0 otherwise */ static int gfs2_should_freeze(const struct gfs2_glock *gl) { const struct gfs2_holder *gh; if (gl->gl_reply & ~LM_OUT_ST_MASK) return 0; if (gl->gl_target == LM_ST_UNLOCKED) return 0; list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; if (LM_FLAG_NOEXP & gh->gh_flags) return 0; } return 1; } /** * gfs2_glock_complete - Callback used by locking * @gl: Pointer to the glock * @ret: The return value from the dlm * * The gl_reply field is under the gl_lockref.lock lock so that it is ok * to use a bitfield shared with other glock state fields. */ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) { struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; spin_lock(&gl->gl_lockref.lock); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { if (gfs2_should_freeze(gl)) { set_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); return; } } gl->gl_lockref.count++; set_bit(GLF_HAVE_REPLY, &gl->gl_flags); gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); } static int glock_cmp(void *priv, const struct list_head *a, const struct list_head *b) { struct gfs2_glock *gla, *glb; gla = list_entry(a, struct gfs2_glock, gl_lru); glb = list_entry(b, struct gfs2_glock, gl_lru); if (gla->gl_name.ln_number > glb->gl_name.ln_number) return 1; if (gla->gl_name.ln_number < glb->gl_name.ln_number) return -1; return 0; } static bool can_free_glock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; return !test_bit(GLF_LOCK, &gl->gl_flags) && !gl->gl_lockref.count && (!test_bit(GLF_LFLUSH, &gl->gl_flags) || test_bit(SDF_KILL, &sdp->sd_flags)); } /** * gfs2_dispose_glock_lru - Demote a list of glocks * @list: The list to dispose of * * Disposing of glocks may involve disk accesses, so that here we sort * the glocks by number (i.e. disk location of the inodes) so that if * there are any such accesses, they'll be sent in order (mostly). * * Must be called under the lru_lock, but may drop and retake this * lock. 
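* (It is dropped via cond_resched_lock() at the end of each iteration.)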
While the lru_lock is dropped, entries may vanish from the * list, but no new entries will appear on the list (since it is * private) */ static unsigned long gfs2_dispose_glock_lru(struct list_head *list) __releases(&lru_lock) __acquires(&lru_lock) { struct gfs2_glock *gl; unsigned long freed = 0; list_sort(NULL, list, glock_cmp); while(!list_empty(list)) { gl = list_first_entry(list, struct gfs2_glock, gl_lru); if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: list_move(&gl->gl_lru, &lru_list); continue; } if (!can_free_glock(gl)) { spin_unlock(&gl->gl_lockref.lock); goto add_back_to_lru; } list_del_init(&gl->gl_lru); atomic_dec(&lru_count); clear_bit(GLF_LRU, &gl->gl_flags); freed++; gl->gl_lockref.count++; if (gl->gl_state != LM_ST_UNLOCKED) request_demote(gl, LM_ST_UNLOCKED, 0, false); gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); cond_resched_lock(&lru_lock); } return freed; } /** * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote * @nr: The number of entries to scan * * This function selects the entries on the LRU which are able to * be demoted, and then kicks off the process by calling * gfs2_dispose_glock_lru() above. */ static unsigned long gfs2_scan_glock_lru(unsigned long nr) { struct gfs2_glock *gl, *next; LIST_HEAD(dispose); unsigned long freed = 0; spin_lock(&lru_lock); list_for_each_entry_safe(gl, next, &lru_list, gl_lru) { if (!nr--) break; if (can_free_glock(gl)) list_move(&gl->gl_lru, &dispose); } if (!list_empty(&dispose)) freed = gfs2_dispose_glock_lru(&dispose); spin_unlock(&lru_lock); return freed; } static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; return gfs2_scan_glock_lru(sc->nr_to_scan); } static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio(atomic_read(&lru_count)); } static struct shrinker *glock_shrinker; /** * glock_hash_walk - Call a function for glock in a hash bucket * @examiner: the function * @sdp: the filesystem * * Note that the function can be called multiple times on the same * object. So the user must ensure that the function can cope with * that. 
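* (The rhashtable walk is simply restarted when the iterator returns -EAGAIN.)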
*/ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; struct rhashtable_iter iter; rhashtable_walk_enter(&gl_hash_table, &iter); do { rhashtable_walk_start(&iter); while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) { if (gl->gl_name.ln_sbd == sdp) examiner(gl); } rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); rhashtable_walk_exit(&iter); } void gfs2_cancel_delete_work(struct gfs2_glock *gl) { clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags); clear_bit(GLF_VERIFY_DELETE, &gl->gl_flags); if (cancel_delayed_work(&gl->gl_delete)) gfs2_glock_put(gl); } static void flush_delete_work(struct gfs2_glock *gl) { if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; if (cancel_delayed_work(&gl->gl_delete)) { queue_delayed_work(sdp->sd_delete_wq, &gl->gl_delete, 0); } } } void gfs2_flush_delete_work(struct gfs2_sbd *sdp) { glock_hash_walk(flush_delete_work, sdp); flush_workqueue(sdp->sd_delete_wq); } /** * thaw_glock - thaw out a glock which has an unprocessed reply waiting * @gl: The glock to thaw * */ static void thaw_glock(struct gfs2_glock *gl) { if (!test_and_clear_bit(GLF_HAVE_FROZEN_REPLY, &gl->gl_flags)) return; if (!lockref_get_not_dead(&gl->gl_lockref)) return; gfs2_glock_remove_from_lru(gl); spin_lock(&gl->gl_lockref.lock); set_bit(GLF_HAVE_REPLY, &gl->gl_flags); gfs2_glock_queue_work(gl, 0); spin_unlock(&gl->gl_lockref.lock); } /** * clear_glock - look at a glock and see if we can free it from glock cache * @gl: the glock to look at * */ static void clear_glock(struct gfs2_glock *gl) { gfs2_glock_remove_from_lru(gl); spin_lock(&gl->gl_lockref.lock); if (!__lockref_is_dead(&gl->gl_lockref)) { gl->gl_lockref.count++; if (gl->gl_state != LM_ST_UNLOCKED) request_demote(gl, LM_ST_UNLOCKED, 0, false); gfs2_glock_queue_work(gl, 0); } spin_unlock(&gl->gl_lockref.lock); } /** * gfs2_glock_thaw - Thaw any frozen glocks * @sdp: The super block * */ void gfs2_glock_thaw(struct gfs2_sbd *sdp) { glock_hash_walk(thaw_glock, sdp); } static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) { spin_lock(&gl->gl_lockref.lock); gfs2_dump_glock(seq, gl, fsid); spin_unlock(&gl->gl_lockref.lock); } static void dump_glock_func(struct gfs2_glock *gl) { dump_glock(NULL, gl, true); } static void withdraw_dq(struct gfs2_glock *gl) { spin_lock(&gl->gl_lockref.lock); if (!__lockref_is_dead(&gl->gl_lockref) && glock_blocked_by_withdraw(gl)) do_error(gl, LM_OUT_ERROR); /* remove pending waiters */ spin_unlock(&gl->gl_lockref.lock); } void gfs2_gl_dq_holders(struct gfs2_sbd *sdp) { glock_hash_walk(withdraw_dq, sdp); } /** * gfs2_gl_hash_clear - Empty out the glock hash table * @sdp: the filesystem * * Called when unmounting the filesystem. */ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) { unsigned long start = jiffies; bool timed_out = false; set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); flush_workqueue(sdp->sd_glock_wq); glock_hash_walk(clear_glock, sdp); flush_workqueue(sdp->sd_glock_wq); while (!timed_out) { wait_event_timeout(sdp->sd_kill_wait, !atomic_read(&sdp->sd_glock_disposal), HZ * 60); if (!atomic_read(&sdp->sd_glock_disposal)) break; timed_out = time_after(jiffies, start + (HZ * 600)); fs_warn(sdp, "%u glocks left after %u seconds%s\n", atomic_read(&sdp->sd_glock_disposal), jiffies_to_msecs(jiffies - start) / 1000, timed_out ? 
":" : "; still waiting"); } gfs2_lm_unmount(sdp); gfs2_free_dead_glocks(sdp); glock_hash_walk(dump_glock_func, sdp); destroy_workqueue(sdp->sd_glock_wq); sdp->sd_glock_wq = NULL; } static const char *state2str(unsigned state) { switch(state) { case LM_ST_UNLOCKED: return "UN"; case LM_ST_SHARED: return "SH"; case LM_ST_DEFERRED: return "DF"; case LM_ST_EXCLUSIVE: return "EX"; } return "??"; } static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) { char *p = buf; if (flags & LM_FLAG_TRY) *p++ = 't'; if (flags & LM_FLAG_TRY_1CB) *p++ = 'T'; if (flags & LM_FLAG_NOEXP) *p++ = 'e'; if (flags & LM_FLAG_ANY) *p++ = 'A'; if (flags & LM_FLAG_NODE_SCOPE) *p++ = 'n'; if (flags & GL_ASYNC) *p++ = 'a'; if (flags & GL_EXACT) *p++ = 'E'; if (flags & GL_NOCACHE) *p++ = 'c'; if (test_bit(HIF_HOLDER, &iflags)) *p++ = 'H'; if (test_bit(HIF_WAIT, &iflags)) *p++ = 'W'; if (flags & GL_SKIP) *p++ = 's'; *p = 0; return buf; } /** * dump_holder - print information about a glock holder * @seq: the seq_file struct * @gh: the glock holder * @fs_id_buf: pointer to file system id (if requested) * */ static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh, const char *fs_id_buf) { const char *comm = "(none)"; pid_t owner_pid = 0; char flags_buf[32]; rcu_read_lock(); if (pid_is_meaningful(gh)) { struct task_struct *gh_owner; comm = "(ended)"; owner_pid = pid_nr(gh->gh_owner_pid); gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); if (gh_owner) comm = gh_owner->comm; } gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n", fs_id_buf, state2str(gh->gh_state), hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip); rcu_read_unlock(); } static const char *gflags2str(char *buf, const struct gfs2_glock *gl) { const unsigned long *gflags = &gl->gl_flags; char *p = buf; if (test_bit(GLF_LOCK, gflags)) *p++ = 'l'; if (test_bit(GLF_DEMOTE, gflags)) *p++ = 'D'; if (test_bit(GLF_PENDING_DEMOTE, gflags)) *p++ = 'd'; if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) *p++ = 'p'; if (test_bit(GLF_DIRTY, gflags)) *p++ = 'y'; if (test_bit(GLF_LFLUSH, gflags)) *p++ = 'f'; if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) *p++ = 'i'; if (test_bit(GLF_HAVE_REPLY, gflags)) *p++ = 'r'; if (test_bit(GLF_INITIAL, gflags)) *p++ = 'a'; if (test_bit(GLF_HAVE_FROZEN_REPLY, gflags)) *p++ = 'F'; if (!list_empty(&gl->gl_holders)) *p++ = 'q'; if (test_bit(GLF_LRU, gflags)) *p++ = 'L'; if (gl->gl_object) *p++ = 'o'; if (test_bit(GLF_BLOCKING, gflags)) *p++ = 'b'; if (test_bit(GLF_UNLOCKED, gflags)) *p++ = 'x'; if (test_bit(GLF_INSTANTIATE_NEEDED, gflags)) *p++ = 'n'; if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags)) *p++ = 'N'; if (test_bit(GLF_TRY_TO_EVICT, gflags)) *p++ = 'e'; if (test_bit(GLF_VERIFY_DELETE, gflags)) *p++ = 'E'; *p = 0; return buf; } /** * gfs2_dump_glock - print information about a glock * @seq: The seq_file struct * @gl: the glock * @fsid: If true, also dump the file system id * * The file format is as follows: * One line per object, capital letters are used to indicate objects * G = glock, I = Inode, R = rgrp, H = holder. Glocks are not indented, * other objects are indented by a single space and follow the glock to * which they are related. Fields are indicated by lower case letters * followed by a colon and the field value, except for strings which are in * [] so that its possible to see if they are composed of spaces for * example. 
The field's are n = number (id of the object), f = flags, * t = type, s = state, r = refcount, e = error, p = pid. * */ void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) { const struct gfs2_glock_operations *glops = gl->gl_ops; unsigned long long dtime; const struct gfs2_holder *gh; char gflags_buf[32]; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; unsigned long nrpages = 0; if (gl->gl_ops->go_flags & GLOF_ASPACE) { struct address_space *mapping = gfs2_glock2aspace(gl); nrpages = mapping->nrpages; } memset(fs_id_buf, 0, sizeof(fs_id_buf)); if (fsid && sdp) /* safety precaution */ sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); dtime = jiffies - gl->gl_demote_time; dtime *= 1000000/HZ; /* demote time in uSec */ if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) dtime = 0; gfs2_print_dbg(seq, "%sG: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d " "v:%d r:%d m:%ld p:%lu\n", fs_id_buf, state2str(gl->gl_state), gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, gflags2str(gflags_buf, gl), state2str(gl->gl_target), state2str(gl->gl_demote_state), dtime, atomic_read(&gl->gl_ail_count), atomic_read(&gl->gl_revokes), (int)gl->gl_lockref.count, gl->gl_hold_time, nrpages); list_for_each_entry(gh, &gl->gl_holders, gh_list) dump_holder(seq, gh, fs_id_buf); if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) glops->go_dump(seq, gl, fs_id_buf); } static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_glock *gl = iter_ptr; seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n", gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number, (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT], (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]); return 0; } static const char *gfs2_gltype[] = { "type", "reserved", "nondisk", "inode", "rgrp", "meta", "iopen", "flock", "plock", "quota", "journal", }; static const char *gfs2_stype[] = { [GFS2_LKS_SRTT] = "srtt", [GFS2_LKS_SRTTVAR] = "srttvar", [GFS2_LKS_SRTTB] = "srttb", [GFS2_LKS_SRTTVARB] = "srttvarb", [GFS2_LKS_SIRT] = "sirt", [GFS2_LKS_SIRTVAR] = "sirtvar", [GFS2_LKS_DCOUNT] = "dlm", [GFS2_LKS_QCOUNT] = "queue", }; #define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype)) static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_sbd *sdp = seq->private; loff_t pos = *(loff_t *)iter_ptr; unsigned index = pos >> 3; unsigned subindex = pos & 0x07; int i; if (index == 0 && subindex != 0) return 0; seq_printf(seq, "%-10s %8s:", gfs2_gltype[index], (index == 0) ? 
"cpu": gfs2_stype[subindex]); for_each_possible_cpu(i) { const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i); if (index == 0) seq_printf(seq, " %15u", i); else seq_printf(seq, " %15llu", (unsigned long long)lkstats-> lkstats[index - 1].stats[subindex]); } seq_putc(seq, '\n'); return 0; } int __init gfs2_glock_init(void) { int i, ret; ret = rhashtable_init(&gl_hash_table, &ht_parms); if (ret < 0) return ret; glock_shrinker = shrinker_alloc(0, "gfs2-glock"); if (!glock_shrinker) { rhashtable_destroy(&gl_hash_table); return -ENOMEM; } glock_shrinker->count_objects = gfs2_glock_shrink_count; glock_shrinker->scan_objects = gfs2_glock_shrink_scan; shrinker_register(glock_shrinker); for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) init_waitqueue_head(glock_wait_table + i); return 0; } void gfs2_glock_exit(void) { shrinker_free(glock_shrinker); rhashtable_destroy(&gl_hash_table); } static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) { struct gfs2_glock *gl = gi->gl; if (gl) { if (n == 0) return; gfs2_glock_put_async(gl); } for (;;) { gl = rhashtable_walk_next(&gi->hti); if (IS_ERR_OR_NULL(gl)) { if (gl == ERR_PTR(-EAGAIN)) { n = 1; continue; } gl = NULL; break; } if (gl->gl_name.ln_sbd != gi->sdp) continue; if (n <= 1) { if (!lockref_get_not_dead(&gl->gl_lockref)) continue; break; } else { if (__lockref_is_dead(&gl->gl_lockref)) continue; n--; } } gi->gl = gl; } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct gfs2_glock_iter *gi = seq->private; loff_t n; /* * We can either stay where we are, skip to the next hash table * entry, or start from the beginning. */ if (*pos < gi->last_pos) { rhashtable_walk_exit(&gi->hti); rhashtable_walk_enter(&gl_hash_table, &gi->hti); n = *pos + 1; } else { n = *pos - gi->last_pos; } rhashtable_walk_start(&gi->hti); gfs2_glock_iter_next(gi, n); gi->last_pos = *pos; return gi->gl; } static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; (*pos)++; gi->last_pos = *pos; gfs2_glock_iter_next(gi, 1); return gi->gl; } static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) __releases(RCU) { struct gfs2_glock_iter *gi = seq->private; rhashtable_walk_stop(&gi->hti); } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) { dump_glock(seq, iter_ptr, false); return 0; } static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) { preempt_disable(); if (*pos >= GFS2_NR_SBSTATS) return NULL; return pos; } static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { (*pos)++; if (*pos >= GFS2_NR_SBSTATS) return NULL; return pos; } static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr) { preempt_enable(); } static const struct seq_operations gfs2_glock_seq_ops = { .start = gfs2_glock_seq_start, .next = gfs2_glock_seq_next, .stop = gfs2_glock_seq_stop, .show = gfs2_glock_seq_show, }; static const struct seq_operations gfs2_glstats_seq_ops = { .start = gfs2_glock_seq_start, .next = gfs2_glock_seq_next, .stop = gfs2_glock_seq_stop, .show = gfs2_glstats_seq_show, }; static const struct seq_operations gfs2_sbstats_sops = { .start = gfs2_sbstats_seq_start, .next = gfs2_sbstats_seq_next, .stop = gfs2_sbstats_seq_stop, .show = gfs2_sbstats_seq_show, }; #define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) static int __gfs2_glocks_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { int ret = 
seq_open_private(file, ops, sizeof(struct gfs2_glock_iter)); if (ret == 0) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; /* * Initially, we are "before" the first hash table entry; the * first call to rhashtable_walk_next gets us the first entry. */ gi->last_pos = -1; gi->gl = NULL; rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } static int gfs2_glocks_open(struct inode *inode, struct file *file) { return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops); } static int gfs2_glocks_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; if (gi->gl) gfs2_glock_put(gi->gl); rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } static int gfs2_glstats_open(struct inode *inode, struct file *file) { return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); } static const struct file_operations gfs2_glocks_fops = { .owner = THIS_MODULE, .open = gfs2_glocks_open, .read = seq_read, .llseek = seq_lseek, .release = gfs2_glocks_release, }; static const struct file_operations gfs2_glstats_fops = { .owner = THIS_MODULE, .open = gfs2_glstats_open, .read = seq_read, .llseek = seq_lseek, .release = gfs2_glocks_release, }; struct gfs2_glockfd_iter { struct super_block *sb; unsigned int tgid; struct task_struct *task; unsigned int fd; struct file *file; }; static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i) { struct pid_namespace *ns = task_active_pid_ns(current); struct pid *pid; if (i->task) put_task_struct(i->task); rcu_read_lock(); retry: i->task = NULL; pid = find_ge_pid(i->tgid, ns); if (pid) { i->tgid = pid_nr_ns(pid, ns); i->task = pid_task(pid, PIDTYPE_TGID); if (!i->task) { i->tgid++; goto retry; } get_task_struct(i->task); } rcu_read_unlock(); return i->task; } static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) { if (i->file) { fput(i->file); i->file = NULL; } for(;; i->fd++) { i->file = fget_task_next(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } if (file_inode(i->file)->i_sb == i->sb) break; fput(i->file); } return i->file; } static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glockfd_iter *i = seq->private; if (*pos) return NULL; while (gfs2_glockfd_next_task(i)) { if (gfs2_glockfd_next_file(i)) return i; i->tgid++; } return NULL; } static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) { struct gfs2_glockfd_iter *i = seq->private; (*pos)++; i->fd++; do { if (gfs2_glockfd_next_file(i)) return i; i->tgid++; } while (gfs2_glockfd_next_task(i)); return NULL; } static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr) { struct gfs2_glockfd_iter *i = seq->private; if (i->file) fput(i->file); if (i->task) put_task_struct(i->task); } static void gfs2_glockfd_seq_show_flock(struct seq_file *seq, struct gfs2_glockfd_iter *i) { struct gfs2_file *fp = i->file->private_data; struct gfs2_holder *fl_gh = &fp->f_fl_gh; struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED }; if (!READ_ONCE(fl_gh->gh_gl)) return; spin_lock(&i->file->f_lock); if (gfs2_holder_initialized(fl_gh)) gl_name = fl_gh->gh_gl->gl_name; spin_unlock(&i->file->f_lock); if (gl_name.ln_type != LM_TYPE_RESERVED) { seq_printf(seq, "%d %u %u/%llx\n", i->tgid, i->fd, gl_name.ln_type, (unsigned long 
long)gl_name.ln_number); } } static int gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) { struct gfs2_glockfd_iter *i = seq->private; struct inode *inode = file_inode(i->file); struct gfs2_glock *gl; inode_lock_shared(inode); gl = GFS2_I(inode)->i_iopen_gh.gh_gl; if (gl) { seq_printf(seq, "%d %u %u/%llx\n", i->tgid, i->fd, gl->gl_name.ln_type, (unsigned long long)gl->gl_name.ln_number); } gfs2_glockfd_seq_show_flock(seq, i); inode_unlock_shared(inode); return 0; } static const struct seq_operations gfs2_glockfd_seq_ops = { .start = gfs2_glockfd_seq_start, .next = gfs2_glockfd_seq_next, .stop = gfs2_glockfd_seq_stop, .show = gfs2_glockfd_seq_show, }; static int gfs2_glockfd_open(struct inode *inode, struct file *file) { struct gfs2_glockfd_iter *i; struct gfs2_sbd *sdp = inode->i_private; i = __seq_open_private(file, &gfs2_glockfd_seq_ops, sizeof(struct gfs2_glockfd_iter)); if (!i) return -ENOMEM; i->sb = sdp->sd_vfs; return 0; } static const struct file_operations gfs2_glockfd_fops = { .owner = THIS_MODULE, .open = gfs2_glockfd_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, }; DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) { sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glocks_fops); debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glockfd_fops); debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_glstats_fops); debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, &gfs2_sbstats_fops); } void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) { debugfs_remove_recursive(sdp->debugfs_dir); sdp->debugfs_dir = NULL; } void gfs2_register_debugfs(void) { gfs2_root = debugfs_create_dir("gfs2", NULL); } void gfs2_unregister_debugfs(void) { debugfs_remove(gfs2_root); gfs2_root = NULL; } |
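/*
 * Editor's sketch (not part of gfs2): a self-contained module showing the
 * same seq_file start/next/stop/show + DEFINE_SEQ_ATTRIBUTE + debugfs
 * pattern that the glocks/glstats/sbstats files above are built on, but
 * walking a small static table instead of the glock hash table. All
 * "seqdemo_*" names are hypothetical.
 */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static const char *const seqdemo_items[] = { "inode", "rgrp", "iopen" };

static void *seqdemo_seq_start(struct seq_file *seq, loff_t *pos)
{
	/* Return a cursor for *pos, or NULL once we are past the end. */
	if (*pos >= ARRAY_SIZE(seqdemo_items))
		return NULL;
	return (void *)&seqdemo_items[*pos];
}

static void *seqdemo_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	(*pos)++;
	return seqdemo_seq_start(seq, pos);
}

static void seqdemo_seq_stop(struct seq_file *seq, void *v)
{
	/* gfs2 stops its rhashtable walk here; nothing to undo in the demo. */
}

static int seqdemo_seq_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%s\n", *(const char *const *)v);
	return 0;
}

static const struct seq_operations seqdemo_sops = {
	.start = seqdemo_seq_start,
	.next  = seqdemo_seq_next,
	.stop  = seqdemo_seq_stop,
	.show  = seqdemo_seq_show,
};

/* Generates seqdemo_fops wired to seq_read/seq_lseek/seq_release. */
DEFINE_SEQ_ATTRIBUTE(seqdemo);

static struct dentry *seqdemo_dir;

static int __init seqdemo_init(void)
{
	seqdemo_dir = debugfs_create_dir("seqfile-demo", NULL);
	debugfs_create_file("items", 0444, seqdemo_dir, NULL, &seqdemo_fops);
	return 0;
}

static void __exit seqdemo_exit(void)
{
	debugfs_remove_recursive(seqdemo_dir);
}

module_init(seqdemo_init);
module_exit(seqdemo_exit);
MODULE_LICENSE("GPL");
/* Reading /sys/kernel/debug/seqfile-demo/items then prints one name per line. */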
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NFNETLINK_H #define _NFNETLINK_H #include <linux/netlink.h> #include <linux/capability.h> #include <net/netlink.h> #include <uapi/linux/netfilter/nfnetlink.h> struct nfnl_info { struct net *net; struct sock *sk; const struct nlmsghdr *nlh; const struct nfgenmsg *nfmsg; struct netlink_ext_ack *extack; }; enum nfnl_callback_type { NFNL_CB_UNSPEC = 0, NFNL_CB_MUTEX, NFNL_CB_RCU, NFNL_CB_BATCH, }; struct nfnl_callback { int (*call)(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]); const struct nla_policy *policy; enum nfnl_callback_type type; __u16 attr_count; }; enum nfnl_abort_action { NFNL_ABORT_NONE = 0, NFNL_ABORT_AUTOLOAD, NFNL_ABORT_VALIDATE, }; struct nfnetlink_subsystem { const char *name; __u8 subsys_id; /* nfnetlink subsystem ID */ __u8 cb_count; /* number of callbacks */ const struct nfnl_callback *cb; /* callback for individual types */ struct module *owner; int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb, enum nfnl_abort_action action); bool (*valid_genid)(struct net *net, u32 genid); }; int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); int nfnetlink_has_listeners(struct net *net, unsigned int group); int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid); void nfnetlink_broadcast(struct net *net, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) { return subsys << 8 | msg_type; } static inline void nfnl_fill_hdr(struct nlmsghdr *nlh, u8 family, u8 version, __be16 res_id) { struct nfgenmsg *nfmsg; nfmsg = nlmsg_data(nlh); nfmsg->nfgen_family = family; nfmsg->version = version; nfmsg->res_id = res_id; } static inline struct nlmsghdr *nfnl_msg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int flags, u8 family, u8 version, __be16 res_id) { struct nlmsghdr *nlh; nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); if (!nlh) return NULL; nfnl_fill_hdr(nlh, family, version, res_id); return nlh; } void nfnl_lock(__u8 subsys_id); void nfnl_unlock(__u8 subsys_id); #ifdef CONFIG_PROVE_LOCKING bool lockdep_nfnl_is_held(__u8 subsys_id); #else static inline bool lockdep_nfnl_is_held(__u8 subsys_id) { return true; } #endif /* CONFIG_PROVE_LOCKING */ #define MODULE_ALIAS_NFNL_SUBSYS(subsys) \ MODULE_ALIAS("nfnetlink-subsys-" __stringify(subsys)) #endif /* _NFNETLINK_H */ |
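/*
 * Editor's sketch of how a subsystem might consume the interface declared
 * above: one RCU callback behind a struct nfnetlink_subsystem registered
 * with nfnetlink_subsys_register(). The NFNLDEMO_* message/attribute ids
 * and the subsystem number are invented for illustration; a real subsystem
 * uses one of the NFNL_SUBSYS_* constants from the uapi header.
 */
#include <linux/module.h>
#include <linux/netlink.h>
#include <net/netlink.h>
#include <linux/netfilter/nfnetlink.h>

#define NFNLDEMO_SUBSYS_ID	31	/* hypothetical id, illustration only */

enum {
	NFNLDEMO_MSG_GET,
	NFNLDEMO_MSG_MAX
};

enum {
	NFNLDEMO_ATTR_UNSPEC,
	NFNLDEMO_ATTR_NAME,
	__NFNLDEMO_ATTR_MAX
};
#define NFNLDEMO_ATTR_MAX	(__NFNLDEMO_ATTR_MAX - 1)

static const struct nla_policy nfnldemo_policy[NFNLDEMO_ATTR_MAX + 1] = {
	[NFNLDEMO_ATTR_NAME] = { .type = NLA_NUL_STRING },
};

/* info->net, info->nlh and info->extack come from struct nfnl_info above. */
static int nfnldemo_get(struct sk_buff *skb, const struct nfnl_info *info,
			const struct nlattr * const cda[])
{
	if (cda[NFNLDEMO_ATTR_NAME])
		pr_info("nfnldemo: get %s\n",
			(char *)nla_data(cda[NFNLDEMO_ATTR_NAME]));
	return 0;
}

static const struct nfnl_callback nfnldemo_cb[NFNLDEMO_MSG_MAX] = {
	[NFNLDEMO_MSG_GET] = {
		.call		= nfnldemo_get,
		.type		= NFNL_CB_RCU,
		.attr_count	= NFNLDEMO_ATTR_MAX,
		.policy		= nfnldemo_policy,
	},
};

static const struct nfnetlink_subsystem nfnldemo_subsys = {
	.name		= "nfnldemo",
	.subsys_id	= NFNLDEMO_SUBSYS_ID,
	.cb_count	= NFNLDEMO_MSG_MAX,
	.cb		= nfnldemo_cb,
	.owner		= THIS_MODULE,
};

static int __init nfnldemo_init(void)
{
	return nfnetlink_subsys_register(&nfnldemo_subsys);
}

static void __exit nfnldemo_exit(void)
{
	nfnetlink_subsys_unregister(&nfnldemo_subsys);
}

module_init(nfnldemo_init);
module_exit(nfnldemo_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NFNL_SUBSYS(NFNLDEMO_SUBSYS_ID);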
54 54 2 2 4 3 1 3 4 13 1 1 1 13 19 19 16 3 3 3 3 10 10 10 10 9 9 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 | // SPDX-License-Identifier: GPL-2.0-only /* * Common helpers for stackable filesystems and backing files. * * Forked from fs/overlayfs/file.c. * * Copyright (C) 2017 Red Hat, Inc. * Copyright (C) 2023 CTERA Networks. */ #include <linux/fs.h> #include <linux/backing-file.h> #include <linux/splice.h> #include <linux/mm.h> #include "internal.h" /** * backing_file_open - open a backing file for kernel internal use * @user_path: path that the user reuqested to open * @flags: open flags * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). * @user_path may be on the stackable filesystem and @real_path on the * underlying filesystem. In this case, we want to be able to return the * @user_path of the stackable filesystem. This is done by embedding the * returned file into a container structure that also stores the stacked * file's path, which can be retrieved using backing_file_user_path(). 
*/ struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred) { struct file *f; int error; f = alloc_empty_backing_file(flags, cred); if (IS_ERR(f)) return f; path_get(user_path); *backing_file_user_path(f) = *user_path; error = vfs_open(real_path, f); if (error) { fput(f); f = ERR_PTR(error); } return f; } EXPORT_SYMBOL_GPL(backing_file_open); struct file *backing_tmpfile_open(const struct path *user_path, int flags, const struct path *real_parentpath, umode_t mode, const struct cred *cred) { struct mnt_idmap *real_idmap = mnt_idmap(real_parentpath->mnt); struct file *f; int error; f = alloc_empty_backing_file(flags, cred); if (IS_ERR(f)) return f; path_get(user_path); *backing_file_user_path(f) = *user_path; error = vfs_tmpfile(real_idmap, real_parentpath, f, mode); if (error) { fput(f); f = ERR_PTR(error); } return f; } EXPORT_SYMBOL(backing_tmpfile_open); struct backing_aio { struct kiocb iocb; refcount_t ref; struct kiocb *orig_iocb; /* used for aio completion */ void (*end_write)(struct kiocb *iocb, ssize_t); struct work_struct work; long res; }; static struct kmem_cache *backing_aio_cachep; #define BACKING_IOCB_MASK \ (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND) static rwf_t iocb_to_rw_flags(int flags) { return (__force rwf_t)(flags & BACKING_IOCB_MASK); } static void backing_aio_put(struct backing_aio *aio) { if (refcount_dec_and_test(&aio->ref)) { fput(aio->iocb.ki_filp); kmem_cache_free(backing_aio_cachep, aio); } } static void backing_aio_cleanup(struct backing_aio *aio, long res) { struct kiocb *iocb = &aio->iocb; struct kiocb *orig_iocb = aio->orig_iocb; orig_iocb->ki_pos = iocb->ki_pos; if (aio->end_write) aio->end_write(orig_iocb, res); backing_aio_put(aio); } static void backing_aio_rw_complete(struct kiocb *iocb, long res) { struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); struct kiocb *orig_iocb = aio->orig_iocb; if (iocb->ki_flags & IOCB_WRITE) kiocb_end_write(iocb); backing_aio_cleanup(aio, res); orig_iocb->ki_complete(orig_iocb, res); } static void backing_aio_complete_work(struct work_struct *work) { struct backing_aio *aio = container_of(work, struct backing_aio, work); backing_aio_rw_complete(&aio->iocb, aio->res); } static void backing_aio_queue_completion(struct kiocb *iocb, long res) { struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb); /* * Punt to a work queue to serialize updates of mtime/size. 
*/ aio->res = res; INIT_WORK(&aio->work, backing_aio_complete_work); queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq, &aio->work); } static int backing_aio_init_wq(struct kiocb *iocb) { struct super_block *sb = file_inode(iocb->ki_filp)->i_sb; if (sb->s_dio_done_wq) return 0; return sb_init_dio_done_wq(sb); } ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx) { struct backing_aio *aio = NULL; const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) return -EIO; if (!iov_iter_count(iter)) return 0; if (iocb->ki_flags & IOCB_DIRECT && !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; old_cred = override_creds(ctx->cred); if (is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(flags); ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf); } else { ret = -ENOMEM; aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); if (!aio) goto out; aio->orig_iocb = iocb; kiocb_clone(&aio->iocb, iocb, get_file(file)); aio->iocb.ki_complete = backing_aio_rw_complete; refcount_set(&aio->ref, 2); ret = vfs_iocb_iter_read(file, &aio->iocb, iter); backing_aio_put(aio); if (ret != -EIOCBQUEUED) backing_aio_cleanup(aio, ret); } out: revert_creds(old_cred); if (ctx->accessed) ctx->accessed(iocb->ki_filp); return ret; } EXPORT_SYMBOL_GPL(backing_file_read_iter); ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter, struct kiocb *iocb, int flags, struct backing_file_ctx *ctx) { const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) return -EIO; if (!iov_iter_count(iter)) return 0; ret = file_remove_privs(iocb->ki_filp); if (ret) return ret; if (iocb->ki_flags & IOCB_DIRECT && !(file->f_mode & FMODE_CAN_ODIRECT)) return -EINVAL; /* * Stacked filesystems don't support deferred completions, don't copy * this property in case it is set by the issuer. 
*/ flags &= ~IOCB_DIO_CALLER_COMP; old_cred = override_creds(ctx->cred); if (is_sync_kiocb(iocb)) { rwf_t rwf = iocb_to_rw_flags(flags); ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf); if (ctx->end_write) ctx->end_write(iocb, ret); } else { struct backing_aio *aio; ret = backing_aio_init_wq(iocb); if (ret) goto out; ret = -ENOMEM; aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL); if (!aio) goto out; aio->orig_iocb = iocb; aio->end_write = ctx->end_write; kiocb_clone(&aio->iocb, iocb, get_file(file)); aio->iocb.ki_flags = flags; aio->iocb.ki_complete = backing_aio_queue_completion; refcount_set(&aio->ref, 2); ret = vfs_iocb_iter_write(file, &aio->iocb, iter); backing_aio_put(aio); if (ret != -EIOCBQUEUED) backing_aio_cleanup(aio, ret); } out: revert_creds(old_cred); return ret; } EXPORT_SYMBOL_GPL(backing_file_write_iter); ssize_t backing_file_splice_read(struct file *in, struct kiocb *iocb, struct pipe_inode_info *pipe, size_t len, unsigned int flags, struct backing_file_ctx *ctx) { const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING))) return -EIO; old_cred = override_creds(ctx->cred); ret = vfs_splice_read(in, &iocb->ki_pos, pipe, len, flags); revert_creds(old_cred); if (ctx->accessed) ctx->accessed(iocb->ki_filp); return ret; } EXPORT_SYMBOL_GPL(backing_file_splice_read); ssize_t backing_file_splice_write(struct pipe_inode_info *pipe, struct file *out, struct kiocb *iocb, size_t len, unsigned int flags, struct backing_file_ctx *ctx) { const struct cred *old_cred; ssize_t ret; if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING))) return -EIO; if (!out->f_op->splice_write) return -EINVAL; ret = file_remove_privs(iocb->ki_filp); if (ret) return ret; old_cred = override_creds(ctx->cred); file_start_write(out); ret = out->f_op->splice_write(pipe, out, &iocb->ki_pos, len, flags); file_end_write(out); revert_creds(old_cred); if (ctx->end_write) ctx->end_write(iocb, ret); return ret; } EXPORT_SYMBOL_GPL(backing_file_splice_write); int backing_file_mmap(struct file *file, struct vm_area_struct *vma, struct backing_file_ctx *ctx) { const struct cred *old_cred; struct file *user_file = vma->vm_file; int ret; if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING))) return -EIO; if (!file->f_op->mmap) return -ENODEV; vma_set_file(vma, file); old_cred = override_creds(ctx->cred); ret = call_mmap(vma->vm_file, vma); revert_creds(old_cred); if (ctx->accessed) ctx->accessed(user_file); return ret; } EXPORT_SYMBOL_GPL(backing_file_mmap); static int __init backing_aio_init(void) { backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN); if (!backing_aio_cachep) return -ENOMEM; return 0; } fs_initcall(backing_aio_init); |
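/*
 * Editor's sketch of how a stacking filesystem (modeled loosely on the
 * overlayfs callers of this file) might drive the helpers above. The
 * stackdemo_* names, the convention of keeping the backing file in
 * file->private_data and the resolved backing path in inode->i_private,
 * and the backing_file_ctx field names (.cred, .accessed, .end_write,
 * mirrored from the usage in this file) are illustrative assumptions,
 * not a drop-in implementation.
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/err.h>
#include <linux/cred.h>
#include <linux/uio.h>
#include <linux/backing-file.h>

static void stackdemo_accessed(struct file *user_file)
{
	/* A real filesystem would propagate atime to its own inode here. */
}

static void stackdemo_end_write(struct kiocb *iocb, ssize_t ret)
{
	/* A real filesystem would copy size/mtime back after a write here. */
}

static int stackdemo_open(struct inode *inode, struct file *file)
{
	/* Sketch convention: lookup stored the backing path in i_private. */
	const struct path *real_path = inode->i_private;
	struct file *realfile;

	realfile = backing_file_open(&file->f_path, file->f_flags,
				     real_path, current_cred());
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	file->private_data = realfile;	/* sketch convention, see above */
	return 0;
}

static int stackdemo_release(struct inode *inode, struct file *file)
{
	fput(file->private_data);
	return 0;
}

static ssize_t stackdemo_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *realfile = iocb->ki_filp->private_data;
	struct backing_file_ctx ctx = {
		.cred	  = current_cred(),	/* overlayfs uses its mounter creds */
		.accessed = stackdemo_accessed,
	};

	return backing_file_read_iter(realfile, iter, iocb, iocb->ki_flags,
				      &ctx);
}

static ssize_t stackdemo_write_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *realfile = iocb->ki_filp->private_data;
	struct backing_file_ctx ctx = {
		.cred	   = current_cred(),
		.end_write = stackdemo_end_write,
	};

	return backing_file_write_iter(realfile, iter, iocb, iocb->ki_flags,
				       &ctx);
}

/* stackdemo_fops would be wired into the sketch filesystem's inode->i_fop. */
static const struct file_operations stackdemo_fops = {
	.open		= stackdemo_open,
	.release	= stackdemo_release,
	.read_iter	= stackdemo_read_iter,
	.write_iter	= stackdemo_write_iter,
};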
// SPDX-License-Identifier:
GPL-2.0-only /* * VXLAN: Virtual eXtensible Local Area Network * * Copyright (c) 2012-2013 Vyatta Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/udp.h> #include <linux/igmp.h> #include <linux/if_ether.h> #include <linux/ethtool.h> #include <net/arp.h> #include <net/ndisc.h> #include <net/gro.h> #include <net/ipv6_stubs.h> #include <net/ip.h> #include <net/icmp.h> #include <net/rtnetlink.h> #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/tun_proto.h> #include <net/vxlan.h> #include <net/nexthop.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> #endif #include "vxlan_private.h" #define VXLAN_VERSION "0.1" #define FDB_AGE_DEFAULT 300 /* 5 min */ #define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ /* UDP port for VXLAN traffic. * The IANA assigned port is 4789, but the Linux default is 8472 * for compatibility with early adopters. */ static unsigned short vxlan_port __read_mostly = 8472; module_param_named(udp_port, vxlan_port, ushort, 0444); MODULE_PARM_DESC(udp_port, "Destination UDP port"); static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); unsigned int vxlan_net_id; const u8 all_zeros_mac[ETH_ALEN + 2]; static struct rtnl_link_ops vxlan_link_ops; static int vxlan_sock_add(struct vxlan_dev *vxlan); static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); /* salt for hash table */ static u32 vxlan_salt __read_mostly; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { return vs->flags & VXLAN_F_COLLECT_METADATA || ip_tunnel_collect_metadata(); } /* Find VXLAN socket based on network namespace, address family, UDP port, * enabled unshareable flags and socket device binding (see l3mdev with * non-default VRF). 
*/ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, __be16 port, u32 flags, int ifindex) { struct vxlan_sock *vs; flags &= VXLAN_F_RCV_FLAGS; hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { if (inet_sk(vs->sock->sk)->inet_sport == port && vxlan_get_sk_family(vs) == family && vs->flags == flags && vs->sock->sk->sk_bound_dev_if == ifindex) return vs; } return NULL; } static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex, __be32 vni, struct vxlan_vni_node **vninode) { struct vxlan_vni_node *vnode; struct vxlan_dev_node *node; /* For flow based devices, map all packets to VNI 0 */ if (vs->flags & VXLAN_F_COLLECT_METADATA && !(vs->flags & VXLAN_F_VNIFILTER)) vni = 0; hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) { if (!node->vxlan) continue; vnode = NULL; if (node->vxlan->cfg.flags & VXLAN_F_VNIFILTER) { vnode = vxlan_vnifilter_lookup(node->vxlan, vni); if (!vnode) continue; } else if (node->vxlan->default_dst.remote_vni != vni) { continue; } if (IS_ENABLED(CONFIG_IPV6)) { const struct vxlan_config *cfg = &node->vxlan->cfg; if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) && cfg->remote_ifindex != ifindex) continue; } if (vninode) *vninode = vnode; return node->vxlan; } return NULL; } /* Look up VNI in a per net namespace table */ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex, __be32 vni, sa_family_t family, __be16 port, u32 flags) { struct vxlan_sock *vs; vs = vxlan_find_sock(net, family, port, flags, ifindex); if (!vs) return NULL; return vxlan_vs_find_vni(vs, ifindex, vni, NULL); } /* Fill in neighbour message in skbuff. */ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, const struct vxlan_fdb *fdb, u32 portid, u32 seq, int type, unsigned int flags, const struct vxlan_rdst *rdst) { unsigned long now = jiffies; struct nda_cacheinfo ci; bool send_ip, send_eth; struct nlmsghdr *nlh; struct nexthop *nh; struct ndmsg *ndm; int nh_family; u32 nh_id; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) return -EMSGSIZE; ndm = nlmsg_data(nlh); memset(ndm, 0, sizeof(*ndm)); send_eth = send_ip = true; rcu_read_lock(); nh = rcu_dereference(fdb->nh); if (nh) { nh_family = nexthop_get_family(nh); nh_id = nh->id; } rcu_read_unlock(); if (type == RTM_GETNEIGH) { if (rdst) { send_ip = !vxlan_addr_any(&rdst->remote_ip); ndm->ndm_family = send_ip ? 
rdst->remote_ip.sa.sa_family : AF_INET; } else if (nh) { ndm->ndm_family = nh_family; } send_eth = !is_zero_ether_addr(fdb->eth_addr); } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; ndm->ndm_ifindex = vxlan->dev->ifindex; ndm->ndm_flags = fdb->flags; if (rdst && rdst->offloaded) ndm->ndm_flags |= NTF_OFFLOADED; ndm->ndm_type = RTN_UNICAST; if (!net_eq(dev_net(vxlan->dev), vxlan->net) && nla_put_s32(skb, NDA_LINK_NETNSID, peernet2id(dev_net(vxlan->dev), vxlan->net))) goto nla_put_failure; if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; if (nh) { if (nla_put_u32(skb, NDA_NH_ID, nh_id)) goto nla_put_failure; } else if (rdst) { if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) goto nla_put_failure; if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && nla_put_be16(skb, NDA_PORT, rdst->remote_port)) goto nla_put_failure; if (rdst->remote_vni != vxlan->default_dst.remote_vni && nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) goto nla_put_failure; if (rdst->remote_ifindex && nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) goto nla_put_failure; } if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni))) goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t vxlan_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ + nla_total_size(sizeof(__be16)) /* NDA_PORT */ + nla_total_size(sizeof(__be32)) /* NDA_VNI */ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */ + nla_total_size(sizeof(struct nda_cacheinfo)); } static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, int type) { struct net *net = dev_net(vxlan->dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd); if (err < 0) { /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, const struct vxlan_fdb *fdb, const struct vxlan_rdst *rd, struct netlink_ext_ack *extack, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { fdb_info->info.dev = vxlan->dev; fdb_info->info.extack = extack; fdb_info->remote_ip = rd->remote_ip; fdb_info->remote_port = rd->remote_port; fdb_info->remote_vni = rd->remote_vni; fdb_info->remote_ifindex = rd->remote_ifindex; memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN); fdb_info->vni = fdb->vni; fdb_info->offloaded = rd->offloaded; fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER; } static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, bool adding, struct netlink_ext_ack *extack) { struct switchdev_notifier_vxlan_fdb_info info; enum switchdev_notifier_type notifier_type; int ret; if (WARN_ON(!rd)) return 0; 
notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE; vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info); ret = call_switchdev_notifiers(notifier_type, vxlan->dev, &info.info, extack); return notifier_to_errno(ret); } static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, struct vxlan_rdst *rd, int type, bool swdev_notify, struct netlink_ext_ack *extack) { int err; if (swdev_notify && rd) { switch (type) { case RTM_NEWNEIGH: err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, true, extack); if (err) return err; break; case RTM_DELNEIGH: vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, false, extack); break; } } __vxlan_fdb_notify(vxlan, fdb, rd, type); return 0; } static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { .remote_ip = *ipa, /* goes to NDA_DST */ .remote_vni = cpu_to_be32(VXLAN_N_VID), }; vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) { struct vxlan_fdb f = { .state = NUD_STALE, }; struct vxlan_rdst remote = { }; memcpy(f.eth_addr, eth_addr, ETH_ALEN); vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); } /* Hash Ethernet address */ static u32 eth_hash(const unsigned char *addr) { u64 value = get_unaligned((u64 *)addr); /* only want 6 bytes */ #ifdef __BIG_ENDIAN value >>= 16; #else value <<= 16; #endif return hash_64(value, FDB_HASH_BITS); } u32 eth_vni_hash(const unsigned char *addr, __be32 vni) { /* use 1 byte of OUI and 3 bytes of NIC */ u32 key = get_unaligned((u32 *)(addr + 2)); return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); } u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) return eth_vni_hash(mac, vni); else return eth_hash(mac); } /* Hash chain to use given mac address */ static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)]; } /* Look up Ethernet address in forwarding table */ static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); struct vxlan_fdb *f; hlist_for_each_entry_rcu(f, head, hlist) { if (ether_addr_equal(mac, f->eth_addr)) { if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { if (vni == f->vni) return f; } else { return f; } } } return NULL; } static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni) { struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, vni); if (f && f->used != jiffies) f->used = jiffies; return f; } /* caller should hold vxlan->hash_lock */ static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex) { struct vxlan_rdst *rd; list_for_each_entry(rd, &f->remotes, list) { if (vxlan_addr_equal(&rd->remote_ip, ip) && rd->remote_port == port && rd->remote_vni == vni && rd->remote_ifindex == ifindex) return rd; } return NULL; } int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); u8 eth_addr[ETH_ALEN + 2] = { 0 }; struct vxlan_rdst *rdst; struct vxlan_fdb *f; int rc = 0; if (is_multicast_ether_addr(mac) || 
is_zero_ether_addr(mac)) return -EINVAL; ether_addr_copy(eth_addr, mac); rcu_read_lock(); f = __vxlan_find_mac(vxlan, eth_addr, vni); if (!f) { rc = -ENOENT; goto out; } rdst = first_remote_rcu(f); vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info); out: rcu_read_unlock(); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc); static int vxlan_fdb_notify_one(struct notifier_block *nb, const struct vxlan_dev *vxlan, const struct vxlan_fdb *f, const struct vxlan_rdst *rdst, struct netlink_ext_ack *extack) { struct switchdev_notifier_vxlan_fdb_info fdb_info; int rc; vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info); rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE, &fdb_info); return notifier_to_errno(rc); } int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; unsigned int h; int rc = 0; if (!netif_is_vxlan(dev)) return -EINVAL; vxlan = netdev_priv(dev); for (h = 0; h < FDB_HASH_SIZE; ++h) { spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { if (f->vni == vni) { list_for_each_entry(rdst, &f->remotes, list) { rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst, extack); if (rc) goto unlock; } } } spin_unlock_bh(&vxlan->hash_lock[h]); } return 0; unlock: spin_unlock_bh(&vxlan->hash_lock[h]); return rc; } EXPORT_SYMBOL_GPL(vxlan_fdb_replay); void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) { struct vxlan_dev *vxlan; struct vxlan_rdst *rdst; struct vxlan_fdb *f; unsigned int h; if (!netif_is_vxlan(dev)) return; vxlan = netdev_priv(dev); for (h = 0; h < FDB_HASH_SIZE; ++h) { spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) if (f->vni == vni) list_for_each_entry(rdst, &f->remotes, list) rdst->offloaded = false; spin_unlock_bh(&vxlan->hash_lock[h]); } } EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); /* Replace destination of unicast mac */ static int vxlan_fdb_replace(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex, struct vxlan_rdst *oldrd) { struct vxlan_rdst *rd; rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); if (rd) return 0; rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); if (!rd) return 0; *oldrd = *rd; dst_cache_reset(&rd->dst_cache); rd->remote_ip = *ip; rd->remote_port = port; rd->remote_vni = vni; rd->remote_ifindex = ifindex; rd->offloaded = false; return 1; } /* Add/update destinations for multicast */ static int vxlan_fdb_append(struct vxlan_fdb *f, union vxlan_addr *ip, __be16 port, __be32 vni, __u32 ifindex, struct vxlan_rdst **rdp) { struct vxlan_rdst *rd; rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); if (rd) return 0; rd = kmalloc(sizeof(*rd), GFP_ATOMIC); if (rd == NULL) return -ENOMEM; if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { kfree(rd); return -ENOMEM; } rd->remote_ip = *ip; rd->remote_port = port; rd->offloaded = false; rd->remote_vni = vni; rd->remote_ifindex = ifindex; list_add_tail_rcu(&rd->list, &f->remotes); *rdp = rd; return 1; } static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol) { const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr; /* Need to have Next Protocol set for interfaces in GPE mode. */ if (!gpe->np_applied) return false; /* "The initial version is 0. If a receiver does not support the * version indicated it MUST drop the packet. 
*/ if (gpe->version != 0) return false; /* "When the O bit is set to 1, the packet is an OAM packet and OAM * processing MUST occur." However, we don't implement OAM * processing, thus drop the packet. */ if (gpe->oam_flag) return false; *protocol = tun_p_to_eth_p(gpe->next_protocol); if (!*protocol) return false; return true; } static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, unsigned int off, struct vxlanhdr *vh, size_t hdrlen, __be32 vni_field, struct gro_remcsum *grc, bool nopartial) { size_t start, offset; if (skb->remcsum_offload) return vh; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; start = vxlan_rco_start(vni_field); offset = start + vxlan_rco_offset(vni_field); vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, start, offset, grc, nopartial); skb->remcsum_offload = 1; return vh; } static struct vxlanhdr *vxlan_gro_prepare_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb, struct gro_remcsum *grc) { struct sk_buff *p; struct vxlanhdr *vh, *vh2; unsigned int hlen, off_vx; struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk); __be32 flags; skb_gro_remcsum_init(grc); off_vx = skb_gro_offset(skb); hlen = off_vx + sizeof(*vh); vh = skb_gro_header(skb, hlen, off_vx); if (unlikely(!vh)) return NULL; skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); flags = vh->vx_flags; if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), vh->vx_vni, grc, !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL)); if (!vh) return NULL; } skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) continue; vh2 = (struct vxlanhdr *)(p->data + off_vx); if (vh->vx_flags != vh2->vx_flags || vh->vx_vni != vh2->vx_vni) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } return vh; } static struct sk_buff *vxlan_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { struct sk_buff *pp = NULL; struct gro_remcsum grc; int flush = 1; if (vxlan_gro_prepare_receive(sk, head, skb, &grc)) { pp = call_gro_receive(eth_gro_receive, head, skb); flush = 0; } skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { const struct packet_offload *ptype; struct sk_buff *pp = NULL; struct gro_remcsum grc; struct vxlanhdr *vh; __be16 protocol; int flush = 1; vh = vxlan_gro_prepare_receive(sk, head, skb, &grc); if (vh) { if (!vxlan_parse_gpe_proto(vh, &protocol)) goto out; ptype = gro_find_receive_by_type(protocol); if (!ptype) goto out; pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; } out: skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { /* Sets 'skb->inner_mac_header' since we are always called with * 'skb->encapsulation' set. 
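 * The inner Ethernet header starts at nhoff + sizeof(struct vxlanhdr).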
*/ return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); } static int vxlan_gpe_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct vxlanhdr *vh = (struct vxlanhdr *)(skb->data + nhoff); const struct packet_offload *ptype; int err = -ENOSYS; __be16 protocol; if (!vxlan_parse_gpe_proto(vh, &protocol)) return err; ptype = gro_find_complete_by_type(protocol); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); return err; } static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac, __u16 state, __be32 src_vni, __u16 ndm_flags) { struct vxlan_fdb *f; f = kmalloc(sizeof(*f), GFP_ATOMIC); if (!f) return NULL; f->state = state; f->flags = ndm_flags; f->updated = f->used = jiffies; f->vni = src_vni; f->nh = NULL; RCU_INIT_POINTER(f->vdev, vxlan); INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); return f; } static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, __be32 src_vni, struct vxlan_fdb *f) { ++vxlan->addrcnt; hlist_add_head_rcu(&f->hlist, vxlan_fdb_head(vxlan, mac, src_vni)); } static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, u32 nhid, struct netlink_ext_ack *extack) { struct nexthop *old_nh = rtnl_dereference(fdb->nh); struct nexthop *nh; int err = -EINVAL; if (old_nh && old_nh->id == nhid) return 0; nh = nexthop_find_by_id(vxlan->net, nhid); if (!nh) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto err_inval; } if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); nh = NULL; goto err_inval; } if (!nexthop_is_fdb(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); goto err_inval; } if (!nexthop_is_multipath(nh)) { NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); goto err_inval; } /* check nexthop group family */ switch (vxlan->default_dst.remote_ip.sa.sa_family) { case AF_INET: if (!nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; } break; case AF_INET6: if (nexthop_has_v4(nh)) { err = -EAFNOSUPPORT; NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); goto err_inval; } } if (old_nh) { list_del_rcu(&fdb->nh_list); nexthop_put(old_nh); } rcu_assign_pointer(fdb->nh, nh); list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list); return 1; err_inval: if (nh) nexthop_put(nh); return err; } int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, struct vxlan_fdb **fdb, struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; int rc; if (vxlan->cfg.addrmax && vxlan->addrcnt >= vxlan->cfg.addrmax) return -ENOSPC; netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags); if (!f) return -ENOMEM; if (nhid) rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); else rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) goto errout; *fdb = f; return 0; errout: kfree(f); return rc; } static void __vxlan_fdb_free(struct vxlan_fdb *f) { struct vxlan_rdst *rd, *nd; struct nexthop *nh; nh = rcu_dereference_raw(f->nh); if (nh) { rcu_assign_pointer(f->nh, NULL); rcu_assign_pointer(f->vdev, NULL); nexthop_put(nh); } list_for_each_entry_safe(rd, nd, &f->remotes, list) { dst_cache_destroy(&rd->dst_cache); kfree(rd); } kfree(f); } static void vxlan_fdb_free(struct rcu_head *head) { struct vxlan_fdb *f = 
container_of(head, struct vxlan_fdb, rcu); __vxlan_fdb_free(f); } static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, bool do_notify, bool swdev_notify) { struct vxlan_rdst *rd; netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); --vxlan->addrcnt; if (do_notify) { if (rcu_access_pointer(f->nh)) vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH, swdev_notify, NULL); else list_for_each_entry(rd, &f->remotes, list) vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL); } hlist_del_rcu(&f->hlist); list_del_rcu(&f->nh_list); call_rcu(&f->rcu, vxlan_fdb_free); } static void vxlan_dst_free(struct rcu_head *head) { struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); dst_cache_destroy(&rd->dst_cache); kfree(rd); } static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 vni, __u32 ifindex, __u16 ndm_flags, struct vxlan_fdb *f, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { __u16 fdb_flags = (ndm_flags & ~NTF_USE); struct vxlan_rdst *rd = NULL; struct vxlan_rdst oldrd; int notify = 0; int rc = 0; int err; if (nhid && !rcu_access_pointer(f->nh)) { NL_SET_ERR_MSG(extack, "Cannot replace an existing non nexthop fdb with a nexthop"); return -EOPNOTSUPP; } if (nhid && (flags & NLM_F_APPEND)) { NL_SET_ERR_MSG(extack, "Cannot append to a nexthop fdb"); return -EOPNOTSUPP; } /* Do not allow an externally learned entry to take over an entry added * by the user. */ if (!(fdb_flags & NTF_EXT_LEARNED) || !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { if (f->state != state) { f->state = state; f->updated = jiffies; notify = 1; } if (f->flags != fdb_flags) { f->flags = fdb_flags; f->updated = jiffies; notify = 1; } } if ((flags & NLM_F_REPLACE)) { /* Only change unicasts */ if (!(is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { if (nhid) { rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); if (rc < 0) return rc; } else { rc = vxlan_fdb_replace(f, ip, port, vni, ifindex, &oldrd); } notify |= rc; } else { NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries"); return -EOPNOTSUPP; } } if ((flags & NLM_F_APPEND) && (is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); if (rc < 0) return rc; notify |= rc; } if (ndm_flags & NTF_USE) f->used = jiffies; if (notify) { if (rd == NULL) rd = first_remote_rtnl(f); err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, swdev_notify, extack); if (err) goto err_notify; } return 0; err_notify: if (nhid) return err; if ((flags & NLM_F_REPLACE) && rc) *rd = oldrd; else if ((flags & NLM_F_APPEND) && rc) { list_del_rcu(&rd->list); call_rcu(&rd->rcu, vxlan_dst_free); } return err; } static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { __u16 fdb_flags = (ndm_flags & ~NTF_USE); struct vxlan_fdb *f; int rc; /* Disallow replace to add a multicast entry */ if ((flags & NLM_F_REPLACE) && (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) return -EOPNOTSUPP; netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, vni, ifindex, fdb_flags, nhid, &f, extack); if (rc < 0) return rc; vxlan_fdb_insert(vxlan, mac, src_vni, f); rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, 
swdev_notify, extack); if (rc) goto err_notify; return 0; err_notify: vxlan_fdb_destroy(vxlan, f, false, false); return rc; } /* Add new entry to forwarding table -- assumes lock held */ int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, src_vni); if (f) { if (flags & NLM_F_EXCL) { netdev_dbg(vxlan->dev, "lost race to create %pM\n", mac); return -EEXIST; } return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, vni, ifindex, ndm_flags, f, nhid, swdev_notify, extack); } else { if (!(flags & NLM_F_CREATE)) return -ENOENT; return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, port, src_vni, vni, ifindex, ndm_flags, nhid, swdev_notify, extack); } } static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, struct vxlan_rdst *rd, bool swdev_notify) { list_del_rcu(&rd->list); vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL); call_rcu(&rd->rcu, vxlan_dst_free); } static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *src_vni, __be32 *vni, u32 *ifindex, u32 *nhid, struct netlink_ext_ack *extack) { struct net *net = dev_net(vxlan->dev); int err; if (tb[NDA_NH_ID] && (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] || tb[NDA_PORT])) { NL_SET_ERR_MSG(extack, "DST, VNI, ifindex and port are mutually exclusive with NH_ID"); return -EINVAL; } if (tb[NDA_DST]) { err = vxlan_nla_get_addr(ip, tb[NDA_DST]); if (err) { NL_SET_ERR_MSG(extack, "Unsupported address family"); return err; } } else { union vxlan_addr *remote = &vxlan->default_dst.remote_ip; if (remote->sa.sa_family == AF_INET) { ip->sin.sin_addr.s_addr = htonl(INADDR_ANY); ip->sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { ip->sin6.sin6_addr = in6addr_any; ip->sa.sa_family = AF_INET6; #endif } } if (tb[NDA_PORT]) { if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) { NL_SET_ERR_MSG(extack, "Invalid vxlan port"); return -EINVAL; } *port = nla_get_be16(tb[NDA_PORT]); } else { *port = vxlan->cfg.dst_port; } if (tb[NDA_VNI]) { if (nla_len(tb[NDA_VNI]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid vni"); return -EINVAL; } *vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); } else { *vni = vxlan->default_dst.remote_vni; } if (tb[NDA_SRC_VNI]) { if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid src vni"); return -EINVAL; } *src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); } else { *src_vni = vxlan->default_dst.remote_vni; } if (tb[NDA_IFINDEX]) { struct net_device *tdev; if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid ifindex"); return -EINVAL; } *ifindex = nla_get_u32(tb[NDA_IFINDEX]); tdev = __dev_get_by_index(net, *ifindex); if (!tdev) { NL_SET_ERR_MSG(extack, "Device not found"); return -EADDRNOTAVAIL; } } else { *ifindex = 0; } *nhid = nla_get_u32_default(tb[NDA_NH_ID], 0); return 0; } /* Add static entry (via netlink) */ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); /* struct net *net = dev_net(vxlan->dev); */ union vxlan_addr ip; __be16 port; __be32 src_vni, vni; u32 ifindex, nhid; u32 hash_index; int err; if (!(ndm->ndm_state & 
(NUD_PERMANENT|NUD_REACHABLE))) { pr_info("RTM_NEWNEIGH with invalid state %#x\n", ndm->ndm_state); return -EINVAL; } if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID])) return -EINVAL; err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, &nhid, extack); if (err) return err; if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) return -EAFNOSUPPORT; hash_index = fdb_head_index(vxlan, addr, src_vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, nhid, true, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); if (!err) *notified = true; return err; } int __vxlan_fdb_delete(struct vxlan_dev *vxlan, const unsigned char *addr, union vxlan_addr ip, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; int err = -ENOENT; f = vxlan_find_mac(vxlan, addr, src_vni); if (!f) return err; if (!vxlan_addr_any(&ip)) { rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); if (!rd) goto out; } /* remove a destination if it's not the only one on the list, * otherwise destroy the fdb entry */ if (rd && !list_is_singular(&f->remotes)) { vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify); goto out; } vxlan_fdb_destroy(vxlan, f, true, swdev_notify); out: return 0; } /* Delete entry (via netlink) */ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; u32 ifindex, nhid; u32 hash_index; __be16 port; int err; err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, &nhid, extack); if (err) return err; hash_index = fdb_head_index(vxlan, addr, src_vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); if (!err) *notified = true; return err; } /* Dump forwarding table */ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; int err = 0; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct vxlan_fdb *f; rcu_read_lock(); hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; if (rcu_access_pointer(f->nh)) { if (*idx < ctx->fdb_idx) goto skip_nh; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, NULL); if (err < 0) { rcu_read_unlock(); goto out; } skip_nh: *idx += 1; continue; } list_for_each_entry_rcu(rd, &f->remotes, list) { if (*idx < ctx->fdb_idx) goto skip; err = vxlan_fdb_info(skb, vxlan, f, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, rd); if (err < 0) { rcu_read_unlock(); goto out; } skip: *idx += 1; } } rcu_read_unlock(); } out: return err; } static int vxlan_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; __be32 vni; int err; if (tb[NDA_VNI]) vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); else vni = vxlan->default_dst.remote_vni; rcu_read_lock(); f = 
__vxlan_find_mac(vxlan, addr, vni); if (!f) { NL_SET_ERR_MSG(extack, "Fdb entry not found"); err = -ENOENT; goto errout; } err = vxlan_fdb_info(skb, vxlan, f, portid, seq, RTM_NEWNEIGH, 0, first_remote_rcu(f)); errout: rcu_read_unlock(); return err; } /* Watch incoming packets to learn mapping between Ethernet address * and Tunnel endpoint. */ static enum skb_drop_reason vxlan_snoop(struct net_device *dev, union vxlan_addr *src_ip, const u8 *src_mac, u32 src_ifindex, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; u32 ifindex = 0; /* Ignore packets from invalid src-address */ if (!is_valid_ether_addr(src_mac)) return SKB_DROP_REASON_MAC_INVALID_SOURCE; #if IS_ENABLED(CONFIG_IPV6) if (src_ip->sa.sa_family == AF_INET6 && (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)) ifindex = src_ifindex; #endif f = vxlan_find_mac(vxlan, src_mac, vni); if (likely(f)) { struct vxlan_rdst *rdst = first_remote_rcu(f); if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) && rdst->remote_ifindex == ifindex)) return SKB_NOT_DROPPED_YET; /* Don't migrate static entries, drop packets */ if (f->state & (NUD_PERMANENT | NUD_NOARP)) return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS; /* Don't override an fdb with nexthop with a learnt entry */ if (rcu_access_pointer(f->nh)) return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS; if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; f->updated = jiffies; vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { u32 hash_index = fdb_head_index(vxlan, src_mac, vni); /* learned new entry */ spin_lock(&vxlan->hash_lock[hash_index]); /* close off race between vxlan_flush and incoming packets */ if (netif_running(dev)) vxlan_fdb_update(vxlan, src_mac, src_ip, NUD_REACHABLE, NLM_F_EXCL|NLM_F_CREATE, vxlan->cfg.dst_port, vni, vxlan->default_dst.remote_vni, ifindex, NTF_SELF, 0, true, NULL); spin_unlock(&vxlan->hash_lock[hash_index]); } return SKB_NOT_DROPPED_YET; } static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) { struct vxlan_net *vn; if (!vs) return false; if (!refcount_dec_and_test(&vs->refcnt)) return false; vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_rcu(&vs->hlist); udp_tunnel_notify_del_rx_port(vs->sock, (vs->flags & VXLAN_F_GPE) ? 
UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); return true; } static void vxlan_sock_release(struct vxlan_dev *vxlan) { struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); RCU_INIT_POINTER(vxlan->vn6_sock, NULL); #endif RCU_INIT_POINTER(vxlan->vn4_sock, NULL); synchronize_net(); if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vs_del_vnigrp(vxlan); else vxlan_vs_del_dev(vxlan); if (__vxlan_sock_release_prep(sock4)) { udp_tunnel_sock_release(sock4->sock); kfree(sock4); } #if IS_ENABLED(CONFIG_IPV6) if (__vxlan_sock_release_prep(sock6)) { udp_tunnel_sock_release(sock6->sock); kfree(sock6); } #endif } static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags) { const struct vxlanhdr *vh = vxlan_hdr(skb); enum skb_drop_reason reason; size_t start, offset; if (!(vh->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) return SKB_NOT_DROPPED_YET; start = vxlan_rco_start(vh->vx_vni); offset = start + vxlan_rco_offset(vh->vx_vni); reason = pskb_may_pull_reason(skb, offset + sizeof(u16)); if (reason) return reason; skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); return SKB_NOT_DROPPED_YET; } static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags, struct vxlan_metadata *md) { const struct vxlanhdr *vh = vxlan_hdr(skb); const struct vxlanhdr_gbp *gbp; struct metadata_dst *tun_dst; gbp = (const struct vxlanhdr_gbp *)vh; if (!(vh->vx_flags & VXLAN_HF_GBP)) return; md->gbp = ntohs(gbp->policy_id); tun_dst = (struct metadata_dst *)skb_dst(skb); if (tun_dst) { __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_dst->u.tun_info.key.tun_flags); tun_dst->u.tun_info.options_len = sizeof(*md); } if (gbp->dont_learn) md->gbp |= VXLAN_GBP_DONT_LEARN; if (gbp->policy_applied) md->gbp |= VXLAN_GBP_POLICY_APPLIED; /* In flow-based mode, GBP is carried in dst_metadata */ if (!(vxflags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; } static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan, struct vxlan_sock *vs, struct sk_buff *skb, __be32 vni) { union vxlan_addr saddr; u32 ifindex = skb->dev->ifindex; skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, vxlan->dev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); /* Ignore packet loops (and multicast echo) */ if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) return SKB_DROP_REASON_LOCAL_MAC; /* Get address from the outer IP header */ if (vxlan_get_sk_family(vs) == AF_INET) { saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; saddr.sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr; saddr.sa.sa_family = AF_INET6; #endif } if (!(vxlan->cfg.flags & VXLAN_F_LEARN)) return SKB_NOT_DROPPED_YET; return vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni); } static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, struct sk_buff *skb) { int err = 0; if (vxlan_get_sk_family(vs) == AF_INET) err = IP_ECN_decapsulate(oiph, skb); #if IS_ENABLED(CONFIG_IPV6) else err = IP6_ECN_decapsulate(oiph, skb); #endif if (unlikely(err) && log_ecn_error) { if (vxlan_get_sk_family(vs) == AF_INET) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); } return err <= 1; } /* Callback from net/ipv4/udp.c to receive packets */ 
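/* Receive path, in brief: validate the VXLAN header and VNI flag, map the
 * VNI to a vxlan_dev, strip the outer headers, optionally attach tunnel
 * metadata (COLLECT_METADATA) and parse the GBP/GPE extensions, then hand
 * the inner frame to the device's GRO cell.
 */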
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_vni_node *vninode = NULL;
	const struct vxlanhdr *vh;
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
	__be16 protocol = htons(ETH_P_TEB);
	enum skb_drop_reason reason;
	bool raw_proto = false;
	void *oiph;
	__be32 vni = 0;
	int nh;

	/* Need UDP and VXLAN header to be present */
	reason = pskb_may_pull_reason(skb, VXLAN_HLEN);
	if (reason)
		goto drop;

	vh = vxlan_hdr(skb);
	/* VNI flag always required to be set */
	if (!(vh->vx_flags & VXLAN_HF_VNI)) {
		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
			   ntohl(vh->vx_flags), ntohl(vh->vx_vni));
		reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
		/* Return non vxlan pkt */
		goto drop;
	}

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		goto drop;

	vni = vxlan_vni(vh->vx_vni);

	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
	if (!vxlan) {
		reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND;
		goto drop;
	}

	if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags ||
	    vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) {
		/* If the header uses bits besides those enabled by the
		 * netdevice configuration, treat this as a malformed packet.
		 * This behavior diverges from VXLAN RFC (RFC7348) which
		 * stipulates that bits in reserved fields are to be
		 * ignored. The approach here maintains compatibility with
		 * previous stack code, and also is more robust and provides a
		 * little more security in adding extensions to VXLAN.
		 */
		reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
		DEV_STATS_INC(vxlan->dev, rx_frame_errors);
		DEV_STATS_INC(vxlan->dev, rx_errors);
		vxlan_vnifilter_count(vxlan, vni, vninode,
				      VXLAN_VNI_STATS_RX_ERRORS, 0);
		goto drop;
	}

	if (vxlan->cfg.flags & VXLAN_F_GPE) {
		if (!vxlan_parse_gpe_proto(vh, &protocol))
			goto drop;
		raw_proto = true;
	}

	if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
				   !net_eq(vxlan->net, dev_net(vxlan->dev)))) {
		reason = SKB_DROP_REASON_NOMEM;
		goto drop;
	}

	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) {
		reason = vxlan_remcsum(skb, vxlan->cfg.flags);
		if (unlikely(reason))
			goto drop;
	}

	if (vxlan_collect_metadata(vs)) {
		IP_TUNNEL_DECLARE_FLAGS(flags) = { };
		struct metadata_dst *tun_dst;

		__set_bit(IP_TUNNEL_KEY_BIT, flags);
		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags,
					 key32_to_tunnel_id(vni), sizeof(*md));
		if (!tun_dst) {
			reason = SKB_DROP_REASON_NOMEM;
			goto drop;
		}

		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);

		skb_dst_set(skb, (struct dst_entry *)tun_dst);
	} else {
		memset(md, 0, sizeof(*md));
	}

	if (vxlan->cfg.flags & VXLAN_F_GBP)
		vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md);
	/* Note that GBP and GPE can never be active together. This is
	 * ensured in vxlan_dev_configure.
	 */

	if (!raw_proto) {
		reason = vxlan_set_mac(vxlan, vs, skb, vni);
		if (reason)
			goto drop;
	} else {
		skb_reset_mac_header(skb);
		skb->dev = vxlan->dev;
		skb->pkt_type = PACKET_HOST;
	}

	/* Save offset of outer header relative to skb->head,
	 * because we are going to reset the network header to the inner header
	 * and might change skb->head.
	 */
	nh = skb_network_header(skb) - skb->head;

	skb_reset_network_header(skb);

	reason = pskb_inet_may_pull_reason(skb);
	if (reason) {
		DEV_STATS_INC(vxlan->dev, rx_length_errors);
		DEV_STATS_INC(vxlan->dev, rx_errors);
		vxlan_vnifilter_count(vxlan, vni, vninode,
				      VXLAN_VNI_STATS_RX_ERRORS, 0);
		goto drop;
	}

	/* Get the outer header.
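	 * Recompute its location from the offset saved in 'nh' above,
	 * since skb->head may have changed.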
*/ oiph = skb->head + nh; if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { reason = SKB_DROP_REASON_IP_TUNNEL_ECN; DEV_STATS_INC(vxlan->dev, rx_frame_errors); DEV_STATS_INC(vxlan->dev, rx_errors); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_ERRORS, 0); goto drop; } rcu_read_lock(); if (unlikely(!(vxlan->dev->flags & IFF_UP))) { rcu_read_unlock(); dev_dstats_rx_dropped(vxlan->dev); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX_DROPS, 0); reason = SKB_DROP_REASON_DEV_READY; goto drop; } dev_dstats_rx_add(vxlan->dev, skb->len); vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len); gro_cells_receive(&vxlan->gro_cells, skb); rcu_read_unlock(); return 0; drop: reason = reason ?: SKB_DROP_REASON_NOT_SPECIFIED; /* Consume bad packet */ kfree_skb_reason(skb, reason); return 0; } /* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */ static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb) { struct vxlan_dev *vxlan; struct vxlan_sock *vs; struct vxlanhdr *hdr; __be32 vni; if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN)) return -EINVAL; hdr = vxlan_hdr(skb); if (!(hdr->vx_flags & VXLAN_HF_VNI)) return -EINVAL; vs = rcu_dereference_sk_user_data(sk); if (!vs) return -ENOENT; vni = vxlan_vni(hdr->vx_vni); vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL); if (!vxlan) return -ENOENT; return 0; } static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct arphdr *parp; u8 *arpptr, *sha; __be32 sip, tip; struct neighbour *n; if (dev->flags & IFF_NOARP) goto out; if (!pskb_may_pull(skb, arp_hdr_len(dev))) { dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); goto out; } parp = arp_hdr(skb); if ((parp->ar_hrd != htons(ARPHRD_ETHER) && parp->ar_hrd != htons(ARPHRD_IEEE802)) || parp->ar_pro != htons(ETH_P_IP) || parp->ar_op != htons(ARPOP_REQUEST) || parp->ar_hln != dev->addr_len || parp->ar_pln != 4) goto out; arpptr = (u8 *)parp + sizeof(struct arphdr); sha = arpptr; arpptr += dev->addr_len; /* sha */ memcpy(&sip, arpptr, sizeof(sip)); arpptr += sizeof(sip); arpptr += dev->addr_len; /* tha */ memcpy(&tip, arpptr, sizeof(tip)); if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip)) goto out; n = neigh_lookup(&arp_tbl, &tip, dev); if (n) { struct vxlan_fdb *f; struct sk_buff *reply; if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) { neigh_release(n); goto out; } f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); goto out; } reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, n->ha, sha); neigh_release(n); if (reply == NULL) goto out; skb_reset_mac_header(reply); __skb_pull(reply, skb_network_offset(reply)); reply->ip_summed = CHECKSUM_UNNECESSARY; reply->pkt_type = PACKET_HOST; if (netif_rx(reply) == NET_RX_DROP) { dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin.sin_addr.s_addr = tip, .sin.sin_family = AF_INET, }; vxlan_ip_miss(dev, &ipa); } out: consume_skb(skb); return NETDEV_TX_OK; } #if IS_ENABLED(CONFIG_IPV6) static struct sk_buff *vxlan_na_create(struct sk_buff *request, struct neighbour *n, bool isrouter) { struct net_device *dev = request->dev; struct sk_buff *reply; struct nd_msg *ns, *na; struct ipv6hdr *pip6; u8 *daddr; int na_olen = 8; /* opt hdr 
+ ETH_ALEN for target */ int ns_olen; int i, len; if (dev == NULL || !pskb_may_pull(request, request->len)) return NULL; len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + sizeof(*na) + na_olen + dev->needed_tailroom; reply = alloc_skb(len, GFP_ATOMIC); if (reply == NULL) return NULL; reply->protocol = htons(ETH_P_IPV6); reply->dev = dev; skb_reserve(reply, LL_RESERVED_SPACE(request->dev)); skb_push(reply, sizeof(struct ethhdr)); skb_reset_mac_header(reply); ns = (struct nd_msg *)(ipv6_hdr(request) + 1); daddr = eth_hdr(request)->h_source; ns_olen = request->len - skb_network_offset(request) - sizeof(struct ipv6hdr) - sizeof(*ns); for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { if (!ns->opt[i + 1]) { kfree_skb(reply); return NULL; } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; } } /* Ethernet header */ ether_addr_copy(eth_hdr(reply)->h_dest, daddr); ether_addr_copy(eth_hdr(reply)->h_source, n->ha); eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); reply->protocol = htons(ETH_P_IPV6); skb_pull(reply, sizeof(struct ethhdr)); skb_reset_network_header(reply); skb_put(reply, sizeof(struct ipv6hdr)); /* IPv6 header */ pip6 = ipv6_hdr(reply); memset(pip6, 0, sizeof(struct ipv6hdr)); pip6->version = 6; pip6->priority = ipv6_hdr(request)->priority; pip6->nexthdr = IPPROTO_ICMPV6; pip6->hop_limit = 255; pip6->daddr = ipv6_hdr(request)->saddr; pip6->saddr = *(struct in6_addr *)n->primary_key; skb_pull(reply, sizeof(struct ipv6hdr)); skb_reset_transport_header(reply); /* Neighbor Advertisement */ na = skb_put_zero(reply, sizeof(*na) + na_olen); na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; na->icmph.icmp6_router = isrouter; na->icmph.icmp6_override = 1; na->icmph.icmp6_solicited = 1; na->target = ns->target; ether_addr_copy(&na->opt[2], n->ha); na->opt[0] = ND_OPT_TARGET_LL_ADDR; na->opt[1] = na_olen >> 3; na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6, csum_partial(na, sizeof(*na)+na_olen, 0)); pip6->payload_len = htons(sizeof(*na)+na_olen); skb_push(reply, sizeof(struct ipv6hdr)); reply->ip_summed = CHECKSUM_UNNECESSARY; return reply; } static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); const struct in6_addr *daddr; const struct ipv6hdr *iphdr; struct inet6_dev *in6_dev; struct neighbour *n; struct nd_msg *msg; rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) goto out; iphdr = ipv6_hdr(skb); daddr = &iphdr->daddr; msg = (struct nd_msg *)(iphdr + 1); if (ipv6_addr_loopback(daddr) || ipv6_addr_is_multicast(&msg->target)) goto out; n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev); if (n) { struct vxlan_fdb *f; struct sk_buff *reply; if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) { neigh_release(n); goto out; } f = vxlan_find_mac(vxlan, n->ha, vni); if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { /* bridge-local neighbor */ neigh_release(n); goto out; } reply = vxlan_na_create(skb, n, !!(f ? 
f->flags & NTF_ROUTER : 0)); neigh_release(n); if (reply == NULL) goto out; if (netif_rx(reply) == NET_RX_DROP) { dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { union vxlan_addr ipa = { .sin6.sin6_addr = msg->target, .sin6.sin6_family = AF_INET6, }; vxlan_ip_miss(dev, &ipa); } out: rcu_read_unlock(); consume_skb(skb); return NETDEV_TX_OK; } #endif static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); struct neighbour *n; if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) return false; n = NULL; switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: { struct iphdr *pip; if (!pskb_may_pull(skb, sizeof(struct iphdr))) return false; pip = ip_hdr(skb); n = neigh_lookup(&arp_tbl, &pip->daddr, dev); if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { union vxlan_addr ipa = { .sin.sin_addr.s_addr = pip->daddr, .sin.sin_family = AF_INET, }; vxlan_ip_miss(dev, &ipa); return false; } break; } #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: { struct ipv6hdr *pip6; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return false; pip6 = ipv6_hdr(skb); n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev); if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { union vxlan_addr ipa = { .sin6.sin6_addr = pip6->daddr, .sin6.sin6_family = AF_INET6, }; vxlan_ip_miss(dev, &ipa); return false; } break; } #endif default: return false; } if (n) { bool diff; diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha); if (diff) { memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, dev->addr_len); memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len); } neigh_release(n); return diff; } return false; } static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, __be16 protocol) { struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh; gpe->np_applied = 1; gpe->next_protocol = tun_p_from_eth_p(protocol); if (!gpe->next_protocol) return -EPFNOSUPPORT; return 0; } static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, int iphdr_len, __be32 vni, struct vxlan_metadata *md, u32 vxflags, bool udp_sum) { struct vxlanhdr *vxh; int min_headroom; int err; int type = udp_sum ? 
SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 inner_protocol = htons(ETH_P_TEB); if ((vxflags & VXLAN_F_REMCSUM_TX) && skb->ip_summed == CHECKSUM_PARTIAL) { int csum_start = skb_checksum_start_offset(skb); if (csum_start <= VXLAN_MAX_REMCSUM_START && !(csum_start & VXLAN_RCO_SHIFT_MASK) && (skb->csum_offset == offsetof(struct udphdr, check) || skb->csum_offset == offsetof(struct tcphdr, check))) type |= SKB_GSO_TUNNEL_REMCSUM; } min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len + VXLAN_HLEN + iphdr_len; /* Need space for new headers (invalidates iph ptr) */ err = skb_cow_head(skb, min_headroom); if (unlikely(err)) return err; err = iptunnel_handle_offloads(skb, type); if (err) return err; vxh = __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = VXLAN_HF_VNI; vxh->vx_vni = vxlan_vni_field(vni); if (type & SKB_GSO_TUNNEL_REMCSUM) { unsigned int start; start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr); vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset); vxh->vx_flags |= VXLAN_HF_RCO; if (!skb_is_gso(skb)) { skb->ip_summed = CHECKSUM_NONE; skb->encapsulation = 0; } } if (vxflags & VXLAN_F_GBP) vxlan_build_gbp_hdr(vxh, md); if (vxflags & VXLAN_F_GPE) { err = vxlan_build_gpe_hdr(vxh, skb->protocol); if (err < 0) return err; inner_protocol = skb->protocol; } skb_set_inner_protocol(skb, inner_protocol); return 0; } /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, struct vxlan_dev *dst_vxlan, __be32 vni, bool snoop) { union vxlan_addr loopback; union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; unsigned int len = skb->len; struct net_device *dev; skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; skb->dev = dst_vxlan->dev; __skb_pull(skb, skb_network_offset(skb)); if (remote_ip->sa.sa_family == AF_INET) { loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); loopback.sa.sa_family = AF_INET; #if IS_ENABLED(CONFIG_IPV6) } else { loopback.sin6.sin6_addr = in6addr_loopback; loopback.sa.sa_family = AF_INET6; #endif } rcu_read_lock(); dev = skb->dev; if (unlikely(!(dev->flags & IFF_UP))) { kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY); goto drop; } if ((dst_vxlan->cfg.flags & VXLAN_F_LEARN) && snoop) vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni); dev_dstats_tx_add(src_vxlan->dev, len); vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len); if (__netif_rx(skb) == NET_RX_SUCCESS) { dev_dstats_rx_add(dst_vxlan->dev, len); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX, len); } else { drop: dev_dstats_rx_dropped(dev); vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX_DROPS, 0); } rcu_read_unlock(); } static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, struct vxlan_dev *vxlan, int addr_family, __be16 dst_port, int dst_ifindex, __be32 vni, struct dst_entry *dst, u32 rt_flags) { #if IS_ENABLED(CONFIG_IPV6) /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry. 
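	 * The BUILD_BUG_ON below enforces this equality at compile time.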
*/ BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL); #endif /* Bypass encapsulation if the destination is local */ if (rt_flags & RTCF_LOCAL && !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && vxlan->cfg.flags & VXLAN_F_LOCALBYPASS) { struct vxlan_dev *dst_vxlan; dst_release(dst); dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, addr_family, dst_port, vxlan->cfg.flags); if (!dst_vxlan) { DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND); return -ENOENT; } vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true); return 1; } return 0; } void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, __be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc) { struct dst_cache *dst_cache; struct ip_tunnel_info *info; struct ip_tunnel_key *pkey; struct ip_tunnel_key key; struct vxlan_dev *vxlan = netdev_priv(dev); const struct iphdr *old_iph; struct vxlan_metadata _md; struct vxlan_metadata *md = &_md; unsigned int pkt_len = skb->len; __be16 src_port = 0, dst_port; struct dst_entry *ndst = NULL; int addr_family; __u8 tos, ttl; int ifindex; int err; u32 flags = vxlan->cfg.flags; bool use_cache; bool udp_sum = false; bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); enum skb_drop_reason reason; bool no_eth_encap; __be32 vni = 0; no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB); reason = skb_vlan_inet_prepare(skb, no_eth_encap); if (reason) goto drop; reason = SKB_DROP_REASON_NOT_SPECIFIED; old_iph = ip_hdr(skb); info = skb_tunnel_info(skb); use_cache = ip_tunnel_dst_cache_usable(skb, info); if (rdst) { memset(&key, 0, sizeof(key)); pkey = &key; if (vxlan_addr_any(&rdst->remote_ip)) { if (did_rsc) { /* short-circuited back to local bridge */ vxlan_encap_bypass(skb, vxlan, vxlan, default_vni, true); return; } goto drop; } addr_family = vxlan->cfg.saddr.sa.sa_family; dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; vni = (rdst->remote_vni) ? : default_vni; ifindex = rdst->remote_ifindex; if (addr_family == AF_INET) { key.u.ipv4.src = vxlan->cfg.saddr.sin.sin_addr.s_addr; key.u.ipv4.dst = rdst->remote_ip.sin.sin_addr.s_addr; } else { key.u.ipv6.src = vxlan->cfg.saddr.sin6.sin6_addr; key.u.ipv6.dst = rdst->remote_ip.sin6.sin6_addr; } dst_cache = &rdst->dst_cache; md->gbp = skb->mark; if (flags & VXLAN_F_TTL_INHERIT) { ttl = ip_tunnel_get_ttl(old_iph, skb); } else { ttl = vxlan->cfg.ttl; if (!ttl && vxlan_addr_multicast(&rdst->remote_ip)) ttl = 1; } tos = vxlan->cfg.tos; if (tos == 1) tos = ip_tunnel_get_dsfield(old_iph, skb); if (tos && !info) use_cache = false; if (addr_family == AF_INET) udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); else udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); #if IS_ENABLED(CONFIG_IPV6) switch (vxlan->cfg.label_policy) { case VXLAN_LABEL_FIXED: key.label = vxlan->cfg.label; break; case VXLAN_LABEL_INHERIT: key.label = ip_tunnel_get_flowlabel(old_iph, skb); break; default: DEBUG_NET_WARN_ON_ONCE(1); goto drop; } #endif } else { if (!info) { WARN_ONCE(1, "%s: Missing encapsulation instructions\n", dev->name); goto drop; } pkey = &info->key; addr_family = ip_tunnel_info_af(info); dst_port = info->key.tp_dst ? 
: vxlan->cfg.dst_port; vni = tunnel_id_to_key32(info->key.tun_id); ifindex = 0; dst_cache = &info->dst_cache; if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) { if (info->options_len < sizeof(*md)) goto drop; md = ip_tunnel_info_opts(info); } ttl = info->key.ttl; tos = info->key.tos; udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags); } src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); rcu_read_lock(); if (addr_family == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); struct rtable *rt; __be16 df = 0; __be32 saddr; if (!ifindex) ifindex = sock4->sock->sk->sk_bound_dev_if; rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, ifindex, &saddr, pkey, src_port, dst_port, tos, use_cache ? dst_cache : NULL); if (IS_ERR(rt)) { err = PTR_ERR(rt); reason = SKB_DROP_REASON_IP_OUTNOROUTES; goto tx_error; } if (!info) { /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, dst_port, ifindex, vni, &rt->dst, rt->rt_flags); if (err) goto out_unlock; if (vxlan->cfg.df == VXLAN_DF_SET) { df = htons(IP_DF); } else if (vxlan->cfg.df == VXLAN_DF_INHERIT) { struct ethhdr *eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_IPV6 || (ntohs(eth->h_proto) == ETH_P_IP && old_iph->frag_off & htons(IP_DF))) df = htons(IP_DF); } } else if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags)) { df = htons(IP_DF); } ndst = &rt->dst; err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom(flags & VXLAN_F_GPE), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; } else if (err) { if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; unclone->key.u.ipv4.src = pkey->u.ipv4.dst; unclone->key.u.ipv4.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); dst_release(ndst); goto out_unlock; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), vni, md, flags, udp_sum); if (err < 0) { reason = SKB_DROP_REASON_NOMEM; goto tx_error; } udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, pkey->u.ipv4.dst, tos, ttl, df, src_port, dst_port, xnet, !udp_sum); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct in6_addr saddr; if (!ifindex) ifindex = sock6->sock->sk->sk_bound_dev_if; ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, ifindex, &saddr, pkey, src_port, dst_port, tos, use_cache ? dst_cache : NULL); if (IS_ERR(ndst)) { err = PTR_ERR(ndst); ndst = NULL; reason = SKB_DROP_REASON_IP_OUTNOROUTES; goto tx_error; } if (!info) { u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags; err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6, dst_port, ifindex, vni, ndst, rt6i_flags); if (err) goto out_unlock; } err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom((flags & VXLAN_F_GPE) | VXLAN_F_IPV6), netif_is_any_bridge_port(dev)); if (err < 0) { goto tx_error; } else if (err) { if (info) { struct ip_tunnel_info *unclone; unclone = skb_tunnel_info_unclone(skb); if (unlikely(!unclone)) goto tx_error; unclone->key.u.ipv6.src = pkey->u.ipv6.dst; unclone->key.u.ipv6.dst = saddr; } vxlan_encap_bypass(skb, vxlan, vxlan, vni, false); dst_release(ndst); goto out_unlock; } tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? 
: ip6_dst_hoplimit(ndst); skb_scrub_packet(skb, xnet); err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), vni, md, flags, udp_sum); if (err < 0) { reason = SKB_DROP_REASON_NOMEM; goto tx_error; } udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, pkey->label, src_port, dst_port, !udp_sum); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); out_unlock: rcu_read_unlock(); return; drop: dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, reason); return; tx_error: rcu_read_unlock(); if (err == -ELOOP) DEV_STATS_INC(dev, collisions); else if (err == -ENETUNREACH) DEV_STATS_INC(dev, tx_carrier_errors); dst_release(ndst); DEV_STATS_INC(dev, tx_errors); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0); kfree_skb_reason(skb, reason); } static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, struct vxlan_fdb *f, __be32 vni, bool did_rsc) { struct vxlan_rdst nh_rdst; struct nexthop *nh; bool do_xmit; u32 hash; memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); rcu_read_lock(); nh = rcu_dereference(f->nh); if (!nh) { rcu_read_unlock(); goto drop; } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); rcu_read_unlock(); if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); else goto drop; return; drop: dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); } static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev, u32 nhid, __be32 vni) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst nh_rdst; struct nexthop *nh; bool do_xmit; u32 hash; memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); hash = skb_get_hash(skb); rcu_read_lock(); nh = nexthop_find_by_id(dev_net(dev), nhid); if (unlikely(!nh || !nexthop_is_fdb(nh) || !nexthop_is_multipath(nh))) { rcu_read_unlock(); goto drop; } do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); rcu_read_unlock(); if (vxlan->cfg.saddr.sa.sa_family != nh_rdst.remote_ip.sa.sa_family) goto drop; if (likely(do_xmit)) vxlan_xmit_one(skb, dev, vni, &nh_rdst, false); else goto drop; return NETDEV_TX_OK; drop: dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(netdev_priv(dev), vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); dev_kfree_skb(skb); return NETDEV_TX_OK; } /* Transmit local packets over Vxlan * * Outer IP header inherits ECN and DF from inner header. * Outer UDP destination is the VXLAN assigned port. 
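 * (a per-remote or per-tunnel-key override, otherwise vxlan->cfg.dst_port);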
* source port is based on hash of flow */ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst, *fdst = NULL; const struct ip_tunnel_info *info; struct vxlan_fdb *f; struct ethhdr *eth; __be32 vni = 0; u32 nhid = 0; bool did_rsc; info = skb_tunnel_info(skb); skb_reset_mac_header(skb); if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { if (info && info->mode & IP_TUNNEL_INFO_BRIDGE && info->mode & IP_TUNNEL_INFO_TX) { vni = tunnel_id_to_key32(info->key.tun_id); nhid = info->key.nhid; } else { if (info && info->mode & IP_TUNNEL_INFO_TX) vxlan_xmit_one(skb, dev, vni, NULL, false); else kfree_skb_reason(skb, SKB_DROP_REASON_TUNNEL_TXINFO); return NETDEV_TX_OK; } } if (vxlan->cfg.flags & VXLAN_F_PROXY) { eth = eth_hdr(skb); if (ntohs(eth->h_proto) == ETH_P_ARP) return arp_reduce(dev, skb, vni); #if IS_ENABLED(CONFIG_IPV6) else if (ntohs(eth->h_proto) == ETH_P_IPV6 && pskb_may_pull(skb, sizeof(struct ipv6hdr) + sizeof(struct nd_msg)) && ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1); if (m->icmph.icmp6_code == 0 && m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) return neigh_reduce(dev, skb, vni); } #endif } if (nhid) return vxlan_xmit_nhid(skb, dev, nhid, vni); if (vxlan->cfg.flags & VXLAN_F_MDB) { struct vxlan_mdb_entry *mdb_entry; rcu_read_lock(); mdb_entry = vxlan_mdb_entry_skb_get(vxlan, skb, vni); if (mdb_entry) { netdev_tx_t ret; ret = vxlan_mdb_xmit(vxlan, mdb_entry, skb); rcu_read_unlock(); return ret; } rcu_read_unlock(); } eth = eth_hdr(skb); f = vxlan_find_mac(vxlan, eth->h_dest, vni); did_rsc = false; if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && (ntohs(eth->h_proto) == ETH_P_IP || ntohs(eth->h_proto) == ETH_P_IPV6)) { did_rsc = route_shortcircuit(dev, skb); if (did_rsc) f = vxlan_find_mac(vxlan, eth->h_dest, vni); } if (f == NULL) { f = vxlan_find_mac(vxlan, all_zeros_mac, vni); if (f == NULL) { if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && !is_multicast_ether_addr(eth->h_dest)) vxlan_fdb_miss(vxlan, eth->h_dest); dev_dstats_tx_dropped(dev); vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0); kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); return NETDEV_TX_OK; } } if (rcu_access_pointer(f->nh)) { vxlan_xmit_nh(skb, dev, f, (vni ? 
: vxlan->default_dst.remote_vni), did_rsc); } else { list_for_each_entry_rcu(rdst, &f->remotes, list) { struct sk_buff *skb1; if (!fdst) { fdst = rdst; continue; } skb1 = skb_clone(skb, GFP_ATOMIC); if (skb1) vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } if (fdst) vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); else kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET); } return NETDEV_TX_OK; } /* Walk the forwarding table and purge stale entries */ static void vxlan_cleanup(struct timer_list *t) { struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; unsigned int h; if (!netif_running(vxlan->dev)) return; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; spin_lock(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); unsigned long timeout; if (f->state & (NUD_PERMANENT | NUD_NOARP)) continue; if (f->flags & NTF_EXT_LEARNED) continue; timeout = f->used + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", f->eth_addr); f->state = NUD_STALE; vxlan_fdb_destroy(vxlan, f, true, true); } else if (time_before(timeout, next_timer)) next_timer = timeout; } spin_unlock(&vxlan->hash_lock[h]); } mod_timer(&vxlan->age_timer, next_timer); } static void vxlan_vs_del_dev(struct vxlan_dev *vxlan) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); spin_lock(&vn->sock_lock); hlist_del_init_rcu(&vxlan->hlist4.hlist); #if IS_ENABLED(CONFIG_IPV6) hlist_del_init_rcu(&vxlan->hlist6.hlist); #endif spin_unlock(&vn->sock_lock); } static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan, struct vxlan_dev_node *node) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); __be32 vni = vxlan->default_dst.remote_vni; node->vxlan = vxlan; spin_lock(&vn->sock_lock); hlist_add_head_rcu(&node->hlist, vni_head(vs, vni)); spin_unlock(&vn->sock_lock); } /* Setup stats when device is created */ static int vxlan_init(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); int err; if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnigroup_init(vxlan); if (err) return err; } err = gro_cells_init(&vxlan->gro_cells, dev); if (err) goto err_vnigroup_uninit; err = vxlan_mdb_init(vxlan); if (err) goto err_gro_cells_destroy; netdev_lockdep_set_classes(dev); return 0; err_gro_cells_destroy: gro_cells_destroy(&vxlan->gro_cells); err_vnigroup_uninit: if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); return err; } static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) { struct vxlan_fdb *f; u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); if (f) vxlan_fdb_destroy(vxlan, f, true, true); spin_unlock_bh(&vxlan->hash_lock[hash_index]); } static void vxlan_uninit(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); vxlan_mdb_fini(vxlan); if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) vxlan_vnigroup_uninit(vxlan); gro_cells_destroy(&vxlan->gro_cells); vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); } /* Start ageing timer and join group when device is brought up */ static int vxlan_open(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); int ret; ret = vxlan_sock_add(vxlan); if (ret < 0) return ret; ret = vxlan_multicast_join(vxlan); if (ret) { vxlan_sock_release(vxlan); return ret; } if 
(vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); return ret; } struct vxlan_fdb_flush_desc { bool ignore_default_entry; unsigned long state; unsigned long state_mask; unsigned long flags; unsigned long flags_mask; __be32 src_vni; u32 nhid; __be32 vni; __be16 port; union vxlan_addr dst_ip; }; static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan) { return is_zero_ether_addr(f->eth_addr) && f->vni == vxlan->cfg.vni; } static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid) { struct nexthop *nh = rtnl_dereference(f->nh); return nh && nh->id == nhid; } static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f, const struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { if (desc->state_mask && (f->state & desc->state_mask) != desc->state) return false; if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags) return false; if (desc->ignore_default_entry && vxlan_fdb_is_default_entry(f, vxlan)) return false; if (desc->src_vni && f->vni != desc->src_vni) return false; if (desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid)) return false; return true; } static bool vxlan_fdb_flush_should_match_remotes(const struct vxlan_fdb_flush_desc *desc) { return desc->vni || desc->port || desc->dst_ip.sa.sa_family; } static bool vxlan_fdb_flush_remote_matches(const struct vxlan_fdb_flush_desc *desc, const struct vxlan_rdst *rd) { if (desc->vni && rd->remote_vni != desc->vni) return false; if (desc->port && rd->remote_port != desc->port) return false; if (desc->dst_ip.sa.sa_family && !vxlan_addr_equal(&rd->remote_ip, &desc->dst_ip)) return false; return true; } static void vxlan_fdb_flush_match_remotes(struct vxlan_fdb *f, struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc, bool *p_destroy_fdb) { bool remotes_flushed = false; struct vxlan_rdst *rd, *tmp; list_for_each_entry_safe(rd, tmp, &f->remotes, list) { if (!vxlan_fdb_flush_remote_matches(desc, rd)) continue; vxlan_fdb_dst_destroy(vxlan, f, rd, true); remotes_flushed = true; } *p_destroy_fdb = remotes_flushed && list_empty(&f->remotes); } /* Purge the forwarding table */ static void vxlan_flush(struct vxlan_dev *vxlan, const struct vxlan_fdb_flush_desc *desc) { bool match_remotes = vxlan_fdb_flush_should_match_remotes(desc); unsigned int h; for (h = 0; h < FDB_HASH_SIZE; ++h) { struct hlist_node *p, *n; spin_lock_bh(&vxlan->hash_lock[h]); hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { struct vxlan_fdb *f = container_of(p, struct vxlan_fdb, hlist); if (!vxlan_fdb_flush_matches(f, vxlan, desc)) continue; if (match_remotes) { bool destroy_fdb = false; vxlan_fdb_flush_match_remotes(f, vxlan, desc, &destroy_fdb); if (!destroy_fdb) continue; } vxlan_fdb_destroy(vxlan, f, true, true); } spin_unlock_bh(&vxlan->hash_lock[h]); } } static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = { [NDA_SRC_VNI] = { .type = NLA_U32 }, [NDA_NH_ID] = { .type = NLA_U32 }, [NDA_VNI] = { .type = NLA_U32 }, [NDA_PORT] = { .type = NLA_U16 }, [NDA_DST] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), [NDA_NDM_STATE_MASK] = { .type = NLA_U16 }, [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 }, }; #define VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF) #define VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP) #define VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_EXT_LEARNED | NTF_OFFLOADED | \ NTF_ROUTER) static int vxlan_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev, struct 
netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = {}; struct ndmsg *ndm = nlmsg_data(nlh); struct nlattr *tb[NDA_MAX + 1]; u8 ndm_flags; int err; ndm_flags = ndm->ndm_flags & ~VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS; err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, vxlan_del_bulk_policy, extack); if (err) return err; if (ndm_flags & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set"); return -EINVAL; } if (ndm->ndm_state & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES) { NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set"); return -EINVAL; } desc.state = ndm->ndm_state; desc.flags = ndm_flags; if (tb[NDA_NDM_STATE_MASK]) desc.state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]); if (tb[NDA_NDM_FLAGS_MASK]) desc.flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]); if (tb[NDA_SRC_VNI]) desc.src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); if (tb[NDA_NH_ID]) desc.nhid = nla_get_u32(tb[NDA_NH_ID]); if (tb[NDA_VNI]) desc.vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); if (tb[NDA_PORT]) desc.port = nla_get_be16(tb[NDA_PORT]); if (tb[NDA_DST]) { union vxlan_addr ip; err = vxlan_nla_get_addr(&ip, tb[NDA_DST]); if (err) { NL_SET_ERR_MSG_ATTR(extack, tb[NDA_DST], "Unsupported address family"); return err; } desc.dst_ip = ip; } vxlan_flush(vxlan, &desc); return 0; } /* Cleanup timer and forwarding table on shutdown */ static int vxlan_stop(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { /* Default entry is deleted at vxlan_uninit. */ .ignore_default_entry = true, .state = 0, .state_mask = NUD_PERMANENT | NUD_NOARP, }; vxlan_multicast_leave(vxlan); del_timer_sync(&vxlan->age_timer); vxlan_flush(vxlan, &desc); vxlan_sock_release(vxlan); return 0; } /* Stub, nothing needs to be done. */ static void vxlan_set_multicast_list(struct net_device *dev) { } static int vxlan_change_mtu(struct net_device *dev, int new_mtu) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; struct net_device *lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); /* This check is different than dev->max_mtu, because it looks at * the lowerdev->mtu, rather than the static dev->max_mtu */ if (lowerdev) { int max_mtu = lowerdev->mtu - vxlan_headroom(vxlan->cfg.flags); if (new_mtu > max_mtu) return -EINVAL; } WRITE_ONCE(dev->mtu, new_mtu); return 0; } static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); struct ip_tunnel_info *info = skb_tunnel_info(skb); __be16 sport, dport; sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, vxlan->cfg.port_max, true); dport = info->key.tp_dst ? 
: vxlan->cfg.dst_port; if (ip_tunnel_info_af(info) == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); struct rtable *rt; if (!sock4) return -EIO; rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, 0, &info->key.u.ipv4.src, &info->key, sport, dport, info->key.tos, &info->dst_cache); if (IS_ERR(rt)) return PTR_ERR(rt); ip_rt_put(rt); } else { #if IS_ENABLED(CONFIG_IPV6) struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct dst_entry *ndst; if (!sock6) return -EIO; ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock, 0, &info->key.u.ipv6.src, &info->key, sport, dport, info->key.tos, &info->dst_cache); if (IS_ERR(ndst)) return PTR_ERR(ndst); dst_release(ndst); #else /* !CONFIG_IPV6 */ return -EPFNOSUPPORT; #endif } info->key.tp_src = sport; info->key.tp_dst = dport; return 0; } static const struct net_device_ops vxlan_netdev_ether_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, .ndo_set_rx_mode = vxlan_set_multicast_list, .ndo_change_mtu = vxlan_change_mtu, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = eth_mac_addr, .ndo_fdb_add = vxlan_fdb_add, .ndo_fdb_del = vxlan_fdb_delete, .ndo_fdb_del_bulk = vxlan_fdb_delete_bulk, .ndo_fdb_dump = vxlan_fdb_dump, .ndo_fdb_get = vxlan_fdb_get, .ndo_mdb_add = vxlan_mdb_add, .ndo_mdb_del = vxlan_mdb_del, .ndo_mdb_del_bulk = vxlan_mdb_del_bulk, .ndo_mdb_dump = vxlan_mdb_dump, .ndo_mdb_get = vxlan_mdb_get, .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; static const struct net_device_ops vxlan_netdev_raw_ops = { .ndo_init = vxlan_init, .ndo_uninit = vxlan_uninit, .ndo_open = vxlan_open, .ndo_stop = vxlan_stop, .ndo_start_xmit = vxlan_xmit, .ndo_change_mtu = vxlan_change_mtu, .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, }; /* Info for udev, that this is a virtual tunnel endpoint */ static const struct device_type vxlan_type = { .name = "vxlan", }; /* Calls the ndo_udp_tunnel_add of the caller in order to * supply the listening VXLAN udp ports. Callers are expected * to implement the ndo_udp_tunnel_add. */ static void vxlan_offload_rx_ports(struct net_device *dev, bool push) { struct vxlan_sock *vs; struct net *net = dev_net(dev); struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int i; spin_lock(&vn->sock_lock); for (i = 0; i < PORT_HASH_SIZE; ++i) { hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { unsigned short type; if (vs->flags & VXLAN_F_GPE) type = UDP_TUNNEL_TYPE_VXLAN_GPE; else type = UDP_TUNNEL_TYPE_VXLAN; if (push) udp_tunnel_push_rx_port(dev, vs->sock, type); else udp_tunnel_drop_rx_port(dev, vs->sock, type); } } spin_unlock(&vn->sock_lock); } /* Initialize the device structure. 
*/ static void vxlan_setup(struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); unsigned int h; eth_hw_addr_random(dev); ether_setup(dev); dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &vxlan_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; dev->vlan_features = dev->features; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; dev->hw_features |= NETIF_F_GSO_SOFTWARE; netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->change_proto_down = true; dev->lltx = true; /* MTU range: 68 - 65535 */ dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = ETH_MAX_MTU; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; INIT_LIST_HEAD(&vxlan->next); timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); vxlan->dev = dev; for (h = 0; h < FDB_HASH_SIZE; ++h) { spin_lock_init(&vxlan->hash_lock[h]); INIT_HLIST_HEAD(&vxlan->fdb_head[h]); } } static void vxlan_ether_setup(struct net_device *dev) { dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->netdev_ops = &vxlan_netdev_ether_ops; } static void vxlan_raw_setup(struct net_device *dev) { dev->header_ops = NULL; dev->type = ARPHRD_NONE; dev->hard_header_len = 0; dev->addr_len = 0; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->netdev_ops = &vxlan_netdev_raw_ops; } static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_UNSPEC] = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS }, [IFLA_VXLAN_ID] = { .type = NLA_U32 }, [IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, [IFLA_VXLAN_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, [IFLA_VXLAN_LABEL] = { .type = NLA_U32 }, [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, [IFLA_VXLAN_DF] = { .type = NLA_U8 }, [IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 }, [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), [IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided link layer address is not Ethernet"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { 
NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], "Provided Ethernet address is not unicast"); return -EADDRNOTAVAIL; } } if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], "MTU must be between 68 and 65535"); return -EINVAL; } } if (!data) { NL_SET_ERR_MSG(extack, "Required attributes not provided to perform the operation"); return -EINVAL; } if (data[IFLA_VXLAN_ID]) { u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); if (id >= VXLAN_N_VID) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID], "VXLAN ID must be lower than 16777216"); return -ERANGE; } } if (data[IFLA_VXLAN_PORT_RANGE]) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); if (ntohs(p->high) < ntohs(p->low)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE], "Invalid source port range"); return -EINVAL; } } if (data[IFLA_VXLAN_DF]) { enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]); if (df < 0 || df > VXLAN_DF_MAX) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF], "Invalid DF attribute"); return -EINVAL; } } return 0; } static void vxlan_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); strscpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); } static int vxlan_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *cmd) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; struct net_device *lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); if (!lowerdev) { cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; cmd->base.speed = SPEED_UNKNOWN; return 0; } return __ethtool_get_link_ksettings(lowerdev, cmd); } static const struct ethtool_ops vxlan_ethtool_ops = { .get_drvinfo = vxlan_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = vxlan_get_link_ksettings, }; static struct socket *vxlan_create_sock(struct net *net, bool ipv6, __be16 port, u32 flags, int ifindex) { struct socket *sock; struct udp_port_cfg udp_conf; int err; memset(&udp_conf, 0, sizeof(udp_conf)); if (ipv6) { udp_conf.family = AF_INET6; udp_conf.use_udp6_rx_checksums = !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); udp_conf.ipv6_v6only = 1; } else { udp_conf.family = AF_INET; } udp_conf.local_udp_port = port; udp_conf.bind_ifindex = ifindex; /* Open UDP socket */ err = udp_sock_create(net, &udp_conf, &sock); if (err < 0) return ERR_PTR(err); udp_allow_gso(sock->sk); return sock; } /* Create new listen socket if needed */ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, __be16 port, u32 flags, int ifindex) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; struct socket *sock; unsigned int h; struct udp_tunnel_sock_cfg tunnel_cfg; vs = kzalloc(sizeof(*vs), GFP_KERNEL); if (!vs) return ERR_PTR(-ENOMEM); for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vs->vni_list[h]); sock = vxlan_create_sock(net, ipv6, port, flags, ifindex); if (IS_ERR(sock)) { kfree(vs); return ERR_CAST(sock); } vs->sock = sock; refcount_set(&vs->refcnt, 1); vs->flags = (flags & VXLAN_F_RCV_FLAGS); spin_lock(&vn->sock_lock); hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); udp_tunnel_notify_add_rx_port(sock, (vs->flags & VXLAN_F_GPE) ? UDP_TUNNEL_TYPE_VXLAN_GPE : UDP_TUNNEL_TYPE_VXLAN); spin_unlock(&vn->sock_lock); /* Mark socket as an encapsulation socket. 
*/ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.sk_user_data = vs; tunnel_cfg.encap_type = 1; tunnel_cfg.encap_rcv = vxlan_rcv; tunnel_cfg.encap_err_lookup = vxlan_err_lookup; tunnel_cfg.encap_destroy = NULL; if (vs->flags & VXLAN_F_GPE) { tunnel_cfg.gro_receive = vxlan_gpe_gro_receive; tunnel_cfg.gro_complete = vxlan_gpe_gro_complete; } else { tunnel_cfg.gro_receive = vxlan_gro_receive; tunnel_cfg.gro_complete = vxlan_gro_complete; } setup_udp_tunnel_sock(net, sock, &tunnel_cfg); return vs; } static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) { struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; struct vxlan_sock *vs = NULL; struct vxlan_dev_node *node; int l3mdev_index = 0; if (vxlan->cfg.remote_ifindex) l3mdev_index = l3mdev_master_upper_ifindex_by_index( vxlan->net, vxlan->cfg.remote_ifindex); if (!vxlan->cfg.no_share) { spin_lock(&vn->sock_lock); vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, vxlan->cfg.dst_port, vxlan->cfg.flags, l3mdev_index); if (vs && !refcount_inc_not_zero(&vs->refcnt)) { spin_unlock(&vn->sock_lock); return -EBUSY; } spin_unlock(&vn->sock_lock); } if (!vs) vs = vxlan_socket_create(vxlan->net, ipv6, vxlan->cfg.dst_port, vxlan->cfg.flags, l3mdev_index); if (IS_ERR(vs)) return PTR_ERR(vs); #if IS_ENABLED(CONFIG_IPV6) if (ipv6) { rcu_assign_pointer(vxlan->vn6_sock, vs); node = &vxlan->hlist6; } else #endif { rcu_assign_pointer(vxlan->vn4_sock, vs); node = &vxlan->hlist4; } if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER)) vxlan_vs_add_vnigrp(vxlan, vs, ipv6); else vxlan_vs_add_dev(vs, vxlan, node); return 0; } static int vxlan_sock_add(struct vxlan_dev *vxlan) { bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata; bool ipv4 = !ipv6 || metadata; int ret = 0; RCU_INIT_POINTER(vxlan->vn4_sock, NULL); #if IS_ENABLED(CONFIG_IPV6) RCU_INIT_POINTER(vxlan->vn6_sock, NULL); if (ipv6) { ret = __vxlan_sock_add(vxlan, true); if (ret < 0 && ret != -EAFNOSUPPORT) ipv4 = false; } #endif if (ipv4) ret = __vxlan_sock_add(vxlan, false); if (ret < 0) vxlan_sock_release(vxlan); return ret; } int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan, struct vxlan_config *conf, __be32 vni) { struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *tmp; list_for_each_entry(tmp, &vn->vxlan_list, next) { if (tmp == vxlan) continue; if (tmp->cfg.flags & VXLAN_F_VNIFILTER) { if (!vxlan_vnifilter_lookup(tmp, vni)) continue; } else if (tmp->cfg.vni != vni) { continue; } if (tmp->cfg.dst_port != conf->dst_port) continue; if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) != (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6))) continue; if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) && tmp->cfg.remote_ifindex != conf->remote_ifindex) continue; return -EEXIST; } return 0; } static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, struct net_device **lower, struct vxlan_dev *old, struct netlink_ext_ack *extack) { bool use_ipv6 = false; if (conf->flags & VXLAN_F_GPE) { /* For now, allow GPE only together with * COLLECT_METADATA. This can be relaxed later; in such * case, the other side of the PtP link will have to be * provided. 
*/ if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) || !(conf->flags & VXLAN_F_COLLECT_METADATA)) { NL_SET_ERR_MSG(extack, "VXLAN GPE does not support this combination of attributes"); return -EINVAL; } } if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) { /* Unless IPv6 is explicitly requested, assume IPv4 */ conf->remote_ip.sa.sa_family = AF_INET; conf->saddr.sa.sa_family = AF_INET; } else if (!conf->remote_ip.sa.sa_family) { conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family; } else if (!conf->saddr.sa.sa_family) { conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family; } if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) { NL_SET_ERR_MSG(extack, "Local and remote address must be from the same family"); return -EINVAL; } if (vxlan_addr_multicast(&conf->saddr)) { NL_SET_ERR_MSG(extack, "Local address cannot be multicast"); return -EINVAL; } if (conf->saddr.sa.sa_family == AF_INET6) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG(extack, "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } use_ipv6 = true; conf->flags |= VXLAN_F_IPV6; if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) { int local_type = ipv6_addr_type(&conf->saddr.sin6.sin6_addr); int remote_type = ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr); if (local_type & IPV6_ADDR_LINKLOCAL) { if (!(remote_type & IPV6_ADDR_LINKLOCAL) && (remote_type != IPV6_ADDR_ANY)) { NL_SET_ERR_MSG(extack, "Invalid combination of local and remote address scopes"); return -EINVAL; } conf->flags |= VXLAN_F_IPV6_LINKLOCAL; } else { if (remote_type == (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) { NL_SET_ERR_MSG(extack, "Invalid combination of local and remote address scopes"); return -EINVAL; } conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL; } } } if (conf->label && !use_ipv6) { NL_SET_ERR_MSG(extack, "Label attribute only applies to IPv6 VXLAN devices"); return -EINVAL; } if (conf->label_policy && !use_ipv6) { NL_SET_ERR_MSG(extack, "Label policy only applies to IPv6 VXLAN devices"); return -EINVAL; } if (conf->remote_ifindex) { struct net_device *lowerdev; lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex); if (!lowerdev) { NL_SET_ERR_MSG(extack, "Invalid local interface, device not found"); return -ENODEV; } #if IS_ENABLED(CONFIG_IPV6) if (use_ipv6) { struct inet6_dev *idev = __in6_dev_get(lowerdev); if (idev && idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 support disabled by administrator"); return -EPERM; } } #endif *lower = lowerdev; } else { if (vxlan_addr_multicast(&conf->remote_ip)) { NL_SET_ERR_MSG(extack, "Local interface required for multicast remote destination"); return -EINVAL; } #if IS_ENABLED(CONFIG_IPV6) if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) { NL_SET_ERR_MSG(extack, "Local interface required for link-local local/remote addresses"); return -EINVAL; } #endif *lower = NULL; } if (!conf->dst_port) { if (conf->flags & VXLAN_F_GPE) conf->dst_port = htons(IANA_VXLAN_GPE_UDP_PORT); else conf->dst_port = htons(vxlan_port); } if (!conf->age_interval) conf->age_interval = FDB_AGE_DEFAULT; if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) { NL_SET_ERR_MSG(extack, "A VXLAN device with the specified VNI already exists"); return -EEXIST; } return 0; } static void vxlan_config_apply(struct net_device *dev, struct vxlan_config *conf, struct net_device *lowerdev, struct net *src_net, bool changelink) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; unsigned short needed_headroom = ETH_HLEN; int max_mtu = ETH_MAX_MTU; u32 flags = conf->flags; if 
(!changelink) { if (flags & VXLAN_F_GPE) vxlan_raw_setup(dev); else vxlan_ether_setup(dev); if (conf->mtu) dev->mtu = conf->mtu; vxlan->net = src_net; } dst->remote_vni = conf->vni; memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); if (lowerdev) { dst->remote_ifindex = conf->remote_ifindex; netif_inherit_tso_max(dev, lowerdev); needed_headroom = lowerdev->hard_header_len; needed_headroom += lowerdev->needed_headroom; dev->needed_tailroom = lowerdev->needed_tailroom; max_mtu = lowerdev->mtu - vxlan_headroom(flags); if (max_mtu < ETH_MIN_MTU) max_mtu = ETH_MIN_MTU; if (!changelink && !conf->mtu) dev->mtu = max_mtu; } if (dev->mtu > max_mtu) dev->mtu = max_mtu; if (flags & VXLAN_F_COLLECT_METADATA) flags |= VXLAN_F_IPV6; needed_headroom += vxlan_headroom(flags); dev->needed_headroom = needed_headroom; memcpy(&vxlan->cfg, conf, sizeof(*conf)); } static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *lowerdev; int ret; ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack); if (ret) return ret; vxlan_config_apply(dev, conf, lowerdev, src_net, changelink); return 0; } static int __vxlan_dev_create(struct net *net, struct net_device *dev, struct vxlan_config *conf, struct netlink_ext_ack *extack) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *remote_dev = NULL; struct vxlan_fdb *f = NULL; bool unregister = false; struct vxlan_rdst *dst; int err; dst = &vxlan->default_dst; err = vxlan_dev_configure(net, dev, conf, false, extack); if (err) return err; dev->ethtool_ops = &vxlan_ethtool_ops; /* create an fdb entry for a valid default destination */ if (!vxlan_addr_any(&dst->remote_ip)) { err = vxlan_fdb_create(vxlan, all_zeros_mac, &dst->remote_ip, NUD_REACHABLE | NUD_PERMANENT, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, NTF_SELF, 0, &f, extack); if (err) return err; } err = register_netdevice(dev); if (err) goto errout; unregister = true; if (dst->remote_ifindex) { remote_dev = __dev_get_by_index(net, dst->remote_ifindex); if (!remote_dev) { err = -ENODEV; goto errout; } err = netdev_upper_dev_link(remote_dev, dev, extack); if (err) goto errout; } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto unlink; if (f) { vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f); /* notify default fdb entry */ err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, true, extack); if (err) { vxlan_fdb_destroy(vxlan, f, false, false); if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); goto unregister; } } list_add(&vxlan->next, &vn->vxlan_list); if (remote_dev) dst->remote_dev = remote_dev; return 0; unlink: if (remote_dev) netdev_upper_dev_unlink(remote_dev, dev); errout: /* unregister_netdevice() destroys the default FDB entry with deletion * notification. But the addition notification was not sent yet, so * destroy the entry by hand here. 
*/ if (f) __vxlan_fdb_free(f); unregister: if (unregister) unregister_netdevice(dev); return err; } /* Set/clear flags based on attribute */ static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[], int attrtype, unsigned long mask, bool changelink, bool changelink_supported, struct netlink_ext_ack *extack) { unsigned long flags; if (!tb[attrtype]) return 0; if (changelink && !changelink_supported) { vxlan_flag_attr_error(attrtype, extack); return -EOPNOTSUPP; } if (vxlan_policy[attrtype].type == NLA_FLAG) flags = conf->flags | mask; else if (nla_get_u8(tb[attrtype])) flags = conf->flags | mask; else flags = conf->flags & ~mask; conf->flags = flags; return 0; } static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], struct net_device *dev, struct vxlan_config *conf, bool changelink, struct netlink_ext_ack *extack) { struct vxlanhdr used_bits = { .vx_flags = VXLAN_HF_VNI, .vx_vni = VXLAN_VNI_MASK, }; struct vxlan_dev *vxlan = netdev_priv(dev); int err = 0; memset(conf, 0, sizeof(*conf)); /* if changelink operation, start with old existing cfg */ if (changelink) memcpy(conf, &vxlan->cfg, sizeof(*conf)); if (data[IFLA_VXLAN_ID]) { __be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); if (changelink && (vni != conf->vni)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI"); return -EOPNOTSUPP; } conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); } if (data[IFLA_VXLAN_GROUP]) { if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group"); return -EOPNOTSUPP; } conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); conf->remote_ip.sa.sa_family = AF_INET; } else if (data[IFLA_VXLAN_GROUP6]) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group"); return -EOPNOTSUPP; } conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); conf->remote_ip.sa.sa_family = AF_INET6; } if (data[IFLA_VXLAN_LOCAL]) { if (changelink && (conf->saddr.sa.sa_family != AF_INET)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old"); return -EOPNOTSUPP; } conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); conf->saddr.sa.sa_family = AF_INET; } else if (data[IFLA_VXLAN_LOCAL6]) { if (!IS_ENABLED(CONFIG_IPV6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel"); return -EPFNOSUPPORT; } if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old"); return -EOPNOTSUPP; } /* TODO: respect scope id */ conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); conf->saddr.sa.sa_family = AF_INET6; } if (data[IFLA_VXLAN_LINK]) conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]); if (data[IFLA_VXLAN_TOS]) conf->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); if (data[IFLA_VXLAN_TTL]) conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); if (data[IFLA_VXLAN_TTL_INHERIT]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT, VXLAN_F_TTL_INHERIT, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_LABEL]) conf->label = 
nla_get_be32(data[IFLA_VXLAN_LABEL]) & IPV6_FLOWLABEL_MASK; if (data[IFLA_VXLAN_LABEL_POLICY]) conf->label_policy = nla_get_u32(data[IFLA_VXLAN_LABEL_POLICY]); if (data[IFLA_VXLAN_LEARNING]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING, VXLAN_F_LEARN, changelink, true, extack); if (err) return err; } else if (!changelink) { /* default to learn on a new device */ conf->flags |= VXLAN_F_LEARN; } if (data[IFLA_VXLAN_AGEING]) conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); if (data[IFLA_VXLAN_PROXY]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY, VXLAN_F_PROXY, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_RSC]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC, VXLAN_F_RSC, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_L2MISS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS, VXLAN_F_L2MISS, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_L3MISS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS, VXLAN_F_L3MISS, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_LIMIT]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT], "Cannot change limit"); return -EOPNOTSUPP; } conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); } if (data[IFLA_VXLAN_COLLECT_METADATA]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA, VXLAN_F_COLLECT_METADATA, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_PORT_RANGE]) { if (!changelink) { const struct ifla_vxlan_port_range *p = nla_data(data[IFLA_VXLAN_PORT_RANGE]); conf->port_min = ntohs(p->low); conf->port_max = ntohs(p->high); } else { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE], "Cannot change port range"); return -EOPNOTSUPP; } } if (data[IFLA_VXLAN_PORT]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT], "Cannot change port"); return -EOPNOTSUPP; } conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); } if (data[IFLA_VXLAN_UDP_CSUM]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM], "Cannot change UDP_CSUM flag"); return -EOPNOTSUPP; } if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; } if (data[IFLA_VXLAN_LOCALBYPASS]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS, VXLAN_F_LOCALBYPASS, changelink, true, extack); if (err) return err; } else if (!changelink) { /* default to local bypass on a new device */ conf->flags |= VXLAN_F_LOCALBYPASS; } if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, VXLAN_F_UDP_ZERO_CSUM6_RX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_TX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX, VXLAN_F_REMCSUM_TX, changelink, false, extack); if (err) return err; } if (data[IFLA_VXLAN_REMCSUM_RX]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX, VXLAN_F_REMCSUM_RX, changelink, false, extack); if (err) return err; used_bits.vx_flags |= VXLAN_HF_RCO; used_bits.vx_vni |= ~VXLAN_VNI_MASK; } if (data[IFLA_VXLAN_GBP]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP, VXLAN_F_GBP, changelink, false, extack); if (err) return err; used_bits.vx_flags |= VXLAN_GBP_USED_BITS; } if (data[IFLA_VXLAN_GPE]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE, VXLAN_F_GPE, changelink, false, extack); if (err) 
return err; used_bits.vx_flags |= VXLAN_GPE_USED_BITS; } if (data[IFLA_VXLAN_RESERVED_BITS]) { struct vxlanhdr reserved_bits; if (changelink) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_RESERVED_BITS], "Cannot change reserved_bits"); return -EOPNOTSUPP; } nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS], sizeof(reserved_bits)); if (used_bits.vx_flags & reserved_bits.vx_flags || used_bits.vx_vni & reserved_bits.vx_vni) { __be64 ub_be64, rb_be64; memcpy(&ub_be64, &used_bits, sizeof(ub_be64)); memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64)); NL_SET_ERR_MSG_ATTR_FMT(extack, data[IFLA_VXLAN_RESERVED_BITS], "Used bits %#018llx cannot overlap reserved bits %#018llx", be64_to_cpu(ub_be64), be64_to_cpu(rb_be64)); return -EINVAL; } conf->reserved_bits = reserved_bits; } else { /* For backwards compatibility, only allow reserved fields to be * used by VXLAN extensions if explicitly requested. */ conf->reserved_bits = (struct vxlanhdr) { .vx_flags = ~used_bits.vx_flags, .vx_vni = ~used_bits.vx_vni, }; } if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL, VXLAN_F_REMCSUM_NOPARTIAL, changelink, false, extack); if (err) return err; } if (tb[IFLA_MTU]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], "Cannot change mtu"); return -EOPNOTSUPP; } conf->mtu = nla_get_u32(tb[IFLA_MTU]); } if (data[IFLA_VXLAN_DF]) conf->df = nla_get_u8(data[IFLA_VXLAN_DF]); if (data[IFLA_VXLAN_VNIFILTER]) { err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER, VXLAN_F_VNIFILTER, changelink, false, extack); if (err) return err; if ((conf->flags & VXLAN_F_VNIFILTER) && !(conf->flags & VXLAN_F_COLLECT_METADATA)) { NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER], "vxlan vnifilter only valid in collect metadata mode"); return -EINVAL; } } return 0; } static int vxlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vxlan_config conf; int err; err = vxlan_nl2conf(tb, data, dev, &conf, false, extack); if (err) return err; return __vxlan_dev_create(src_net, dev, &conf, extack); } static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); struct net_device *lowerdev; struct vxlan_config conf; struct vxlan_rdst *dst; int err; dst = &vxlan->default_dst; err = vxlan_nl2conf(tb, data, dev, &conf, true, extack); if (err) return err; err = vxlan_config_validate(vxlan->net, &conf, &lowerdev, vxlan, extack); if (err) return err; if (dst->remote_dev == lowerdev) lowerdev = NULL; err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev, extack); if (err) return err; /* handle default dst entry */ if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) { u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); if (!vxlan_addr_any(&conf.remote_ip)) { err = vxlan_fdb_update(vxlan, all_zeros_mac, &conf.remote_ip, NUD_REACHABLE | NUD_PERMANENT, NLM_F_APPEND | NLM_F_CREATE, vxlan->cfg.dst_port, conf.vni, conf.vni, conf.remote_ifindex, NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock[hash_index]); netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; } } if (!vxlan_addr_any(&dst->remote_ip)) __vxlan_fdb_delete(vxlan, all_zeros_mac, dst->remote_ip, vxlan->cfg.dst_port, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, true); 
spin_unlock_bh(&vxlan->hash_lock[hash_index]); /* If vni filtering device, also update fdb entries of * all vnis that were using default remote ip */ if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) { err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip, &conf.remote_ip, extack); if (err) { netdev_adjacent_change_abort(dst->remote_dev, lowerdev, dev); return err; } } } if (conf.age_interval != vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies); netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev); if (lowerdev && lowerdev != dst->remote_dev) dst->remote_dev = lowerdev; vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true); return 0; } static void vxlan_dellink(struct net_device *dev, struct list_head *head) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb_flush_desc desc = { /* Default entry is deleted at vxlan_uninit. */ .ignore_default_entry = true, }; vxlan_flush(vxlan, &desc); list_del(&vxlan->next); unregister_netdevice_queue(dev, head); if (vxlan->default_dst.remote_dev) netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev); } static size_t vxlan_get_size(const struct net_device *dev) { return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL_INHERIT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_DF */ nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LABEL_POLICY */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LOCALBYPASS */ /* IFLA_VXLAN_PORT_RANGE */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + nla_total_size(0) + /* IFLA_VXLAN_GBP */ nla_total_size(0) + /* IFLA_VXLAN_GPE */ nla_total_size(0) + /* IFLA_VXLAN_REMCSUM_NOPARTIAL */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_VNIFILTER */ /* IFLA_VXLAN_RESERVED_BITS */ nla_total_size(sizeof(struct vxlanhdr)) + 0; } static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { const struct vxlan_dev *vxlan = netdev_priv(dev); const struct vxlan_rdst *dst = &vxlan->default_dst; struct ifla_vxlan_port_range ports = { .low = htons(vxlan->cfg.port_min), .high = htons(vxlan->cfg.port_max), }; if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni))) goto nla_put_failure; if (!vxlan_addr_any(&dst->remote_ip)) { if (dst->remote_ip.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP, dst->remote_ip.sin.sin_addr.s_addr)) goto nla_put_failure; #if 
IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6, &dst->remote_ip.sin6.sin6_addr)) goto nla_put_failure; #endif } } if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) goto nla_put_failure; if (!vxlan_addr_any(&vxlan->cfg.saddr)) { if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, vxlan->cfg.saddr.sin.sin_addr.s_addr)) goto nla_put_failure; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, &vxlan->cfg.saddr.sin6.sin6_addr)) goto nla_put_failure; #endif } } if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT, !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) || nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) || nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || nla_put_u32(skb, IFLA_VXLAN_LABEL_POLICY, vxlan->cfg.label_policy) || nla_put_u8(skb, IFLA_VXLAN_LEARNING, !!(vxlan->cfg.flags & VXLAN_F_LEARN)) || nla_put_u8(skb, IFLA_VXLAN_PROXY, !!(vxlan->cfg.flags & VXLAN_F_PROXY)) || nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->cfg.flags & VXLAN_F_RSC)) || nla_put_u8(skb, IFLA_VXLAN_L2MISS, !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) || nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) || nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS, !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_GBP && nla_put_flag(skb, IFLA_VXLAN_GBP)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_GPE && nla_put_flag(skb, IFLA_VXLAN_GPE)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL && nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) goto nla_put_failure; if (vxlan->cfg.flags & VXLAN_F_VNIFILTER && nla_put_u8(skb, IFLA_VXLAN_VNIFILTER, !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))) goto nla_put_failure; if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS, sizeof(vxlan->cfg.reserved_bits), &vxlan->cfg.reserved_bits)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *vxlan_get_link_net(const struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); return READ_ONCE(vxlan->net); } static struct rtnl_link_ops vxlan_link_ops __read_mostly = { .kind = "vxlan", .maxtype = IFLA_VXLAN_MAX, .policy = vxlan_policy, .priv_size = sizeof(struct vxlan_dev), .setup = vxlan_setup, .validate = vxlan_validate, .newlink = vxlan_newlink, .changelink = vxlan_changelink, .dellink = vxlan_dellink, .get_size = vxlan_get_size, .fill_info = vxlan_fill_info, .get_link_net = vxlan_get_link_net, }; struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct 
vxlan_config *conf) { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; int err; memset(&tb, 0, sizeof(tb)); dev = rtnl_create_link(net, name, name_assign_type, &vxlan_link_ops, tb, NULL); if (IS_ERR(dev)) return dev; err = __vxlan_dev_create(net, dev, conf, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) { LIST_HEAD(list_kill); vxlan_dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return ERR_PTR(err); } return dev; } EXPORT_SYMBOL_GPL(vxlan_dev_create); static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, struct net_device *dev) { struct vxlan_dev *vxlan, *next; LIST_HEAD(list_kill); list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { struct vxlan_rdst *dst = &vxlan->default_dst; /* In case we created vxlan device with carrier * and we loose the carrier due to module unload * we also need to remove vxlan device. In other * cases, it's not necessary and remote_ifindex * is 0 here, so no matches. */ if (dst->remote_ifindex == dev->ifindex) vxlan_dellink(vxlan->dev, &list_kill); } unregister_netdevice_many(&list_kill); } static int vxlan_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); if (event == NETDEV_UNREGISTER) vxlan_handle_lowerdev_unregister(vn, dev); else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO) vxlan_offload_rx_ports(dev, true); else if (event == NETDEV_UDP_TUNNEL_DROP_INFO) vxlan_offload_rx_ports(dev, false); return NOTIFY_DONE; } static struct notifier_block vxlan_notifier_block __read_mostly = { .notifier_call = vxlan_netdevice_event, }; static void vxlan_fdb_offloaded_set(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *rdst; struct vxlan_fdb *f; u32 hash_index; hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) goto out; rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip, fdb_info->remote_port, fdb_info->remote_vni, fdb_info->remote_ifindex); if (!rdst) goto out; rdst->offloaded = fdb_info->offloaded; out: spin_unlock_bh(&vxlan->hash_lock[hash_index]); } static int vxlan_fdb_external_learn_add(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct netlink_ext_ack *extack; u32 hash_index; int err; hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); extack = switchdev_notifier_info_to_extack(&fdb_info->info); spin_lock_bh(&vxlan->hash_lock[hash_index]); err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip, NUD_REACHABLE, NLM_F_CREATE | NLM_F_REPLACE, fdb_info->remote_port, fdb_info->vni, fdb_info->remote_vni, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, 0, false, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } static int vxlan_fdb_external_learn_del(struct net_device *dev, struct switchdev_notifier_vxlan_fdb_info *fdb_info) { struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_fdb *f; u32 hash_index; int err = 0; hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); if (!f) err = -ENOENT; else if (f->flags 
& NTF_EXT_LEARNED) err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr, fdb_info->remote_ip, fdb_info->remote_port, fdb_info->vni, fdb_info->remote_vni, fdb_info->remote_ifindex, false); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; } static int vxlan_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct switchdev_notifier_vxlan_fdb_info *fdb_info; int err = 0; switch (event) { case SWITCHDEV_VXLAN_FDB_OFFLOADED: vxlan_fdb_offloaded_set(dev, ptr); break; case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = vxlan_fdb_external_learn_add(dev, fdb_info); if (err) { err = notifier_from_errno(err); break; } fdb_info->offloaded = true; vxlan_fdb_offloaded_set(dev, fdb_info); break; case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = vxlan_fdb_external_learn_del(dev, fdb_info); if (err) { err = notifier_from_errno(err); break; } fdb_info->offloaded = false; vxlan_fdb_offloaded_set(dev, fdb_info); break; } return err; } static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = { .notifier_call = vxlan_switchdev_event, }; static void vxlan_fdb_nh_flush(struct nexthop *nh) { struct vxlan_fdb *fdb; struct vxlan_dev *vxlan; u32 hash_index; rcu_read_lock(); list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) { vxlan = rcu_dereference(fdb->vdev); WARN_ON(!vxlan); hash_index = fdb_head_index(vxlan, fdb->eth_addr, vxlan->default_dst.remote_vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); if (!hlist_unhashed(&fdb->hlist)) vxlan_fdb_destroy(vxlan, fdb, false, false); spin_unlock_bh(&vxlan->hash_lock[hash_index]); } rcu_read_unlock(); } static int vxlan_nexthop_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct nh_notifier_info *info = ptr; struct nexthop *nh; if (event != NEXTHOP_EVENT_DEL) return NOTIFY_DONE; nh = nexthop_find_by_id(info->net, info->id); if (!nh) return NOTIFY_DONE; vxlan_fdb_nh_flush(nh); return NOTIFY_DONE; } static __net_init int vxlan_init_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int h; INIT_LIST_HEAD(&vn->vxlan_list); spin_lock_init(&vn->sock_lock); vn->nexthop_notifier_block.notifier_call = vxlan_nexthop_event; for (h = 0; h < PORT_HASH_SIZE; ++h) INIT_HLIST_HEAD(&vn->sock_list[h]); return register_nexthop_notifier(net, &vn->nexthop_notifier_block, NULL); } static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn, struct list_head *dev_to_kill) { struct vxlan_dev *vxlan, *next; list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) vxlan_dellink(vxlan->dev, dev_to_kill); } static void __net_exit vxlan_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); __unregister_nexthop_notifier(net, &vn->nexthop_notifier_block); vxlan_destroy_tunnels(vn, dev_to_kill); } } static void __net_exit vxlan_exit_net(struct net *net) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int h; for (h = 0; h < PORT_HASH_SIZE; ++h) WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); } static struct pernet_operations vxlan_net_ops = { .init = vxlan_init_net, .exit_batch_rtnl = vxlan_exit_batch_rtnl, .exit = vxlan_exit_net, .id = &vxlan_net_id, .size = sizeof(struct vxlan_net), }; static int __init vxlan_init_module(void) { int rc; get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); rc = 
register_pernet_subsys(&vxlan_net_ops); if (rc) goto out1; rc = register_netdevice_notifier(&vxlan_notifier_block); if (rc) goto out2; rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block); if (rc) goto out3; rc = rtnl_link_register(&vxlan_link_ops); if (rc) goto out4; rc = vxlan_vnifilter_init(); if (rc) goto out5; return 0; out5: rtnl_link_unregister(&vxlan_link_ops); out4: unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); out3: unregister_netdevice_notifier(&vxlan_notifier_block); out2: unregister_pernet_subsys(&vxlan_net_ops); out1: return rc; } late_initcall(vxlan_init_module); static void __exit vxlan_cleanup_module(void) { vxlan_vnifilter_uninit(); rtnl_link_unregister(&vxlan_link_ops); unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); unregister_netdevice_notifier(&vxlan_notifier_block); unregister_pernet_subsys(&vxlan_net_ops); /* rcu_barrier() is called by netns */ } module_exit(vxlan_cleanup_module); MODULE_LICENSE("GPL"); MODULE_VERSION(VXLAN_VERSION); MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>"); MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic"); MODULE_ALIAS_RTNL_LINK("vxlan"); |
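/*
 * Illustrative sketch (not part of the original driver): one way an in-kernel
 * caller could use the exported vxlan_dev_create() helper above to create a
 * metadata-based (collect-metadata) VXLAN device. The function name
 * example_create_vxlan() and its parameters are hypothetical, error handling
 * is minimal, and the caller is assumed to hold rtnl_lock() around device
 * creation, mirroring how existing in-tree users of this API operate.
 */
static struct net_device *example_create_vxlan(struct net *net, __be16 dst_port)
{
	struct vxlan_config conf = {
		/* per-packet VNI and remote come from tunnel metadata */
		.flags		= VXLAN_F_COLLECT_METADATA,
		/* 0 would let vxlan_config_validate() pick the default port */
		.dst_port	= dst_port,
	};
	struct net_device *dev;

	rtnl_lock();
	dev = vxlan_dev_create(net, "vxlan_ex%d", NET_NAME_USER, &conf);
	rtnl_unlock();

	/* ERR_PTR()-encoded error on failure, registered device on success */
	return dev;
}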
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>
#include <linux/uio.h>

static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
	struct kref kref;
	wait_queue_head_t wqh;
	/*
	 * Every time that a write(2) is performed on an eventfd, the
	 * value of the __u64 being written is added to "count" and a
	 * wakeup is performed on "wqh". If the EFD_SEMAPHORE flag was not
	 * specified, a read(2) will return the "count" value to userspace
	 * and will reset "count" to zero. The kernel-side eventfd_signal()
	 * also adds to the "count" counter and issues a wakeup.
	 */
	__u64 count;
	unsigned int flags;
	int id;
};

/**
 * eventfd_signal_mask - Increment the event counter
 * @ctx: [in] Pointer to the eventfd context.
 * @mask: [in] poll mask
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the
 * ULLONG_MAX value, and we signal this as an overflow condition by returning
 * EPOLLERR to poll(2).
 */
void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
{
	unsigned long flags;

	/*
	 * Deadlock or stack overflow issues can happen if we recurse here
	 * through waitqueue wakeup handlers.
If the caller users potentially * nested waitqueues with custom wakeup handlers, then it should * check eventfd_signal_allowed() before calling this function. If * it returns false, the eventfd_signal() call should be deferred to a * safe context. */ if (WARN_ON_ONCE(current->in_eventfd)) return; spin_lock_irqsave(&ctx->wqh.lock, flags); current->in_eventfd = 1; if (ctx->count < ULLONG_MAX) ctx->count++; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); current->in_eventfd = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); } EXPORT_SYMBOL_GPL(eventfd_signal_mask); static void eventfd_free_ctx(struct eventfd_ctx *ctx) { if (ctx->id >= 0) ida_free(&eventfd_ida, ctx->id); kfree(ctx); } static void eventfd_free(struct kref *kref) { struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref); eventfd_free_ctx(ctx); } /** * eventfd_ctx_put - Releases a reference to the internal eventfd context. * @ctx: [in] Pointer to eventfd context. * * The eventfd context reference must have been previously acquired either * with eventfd_ctx_fdget() or eventfd_ctx_fileget(). */ void eventfd_ctx_put(struct eventfd_ctx *ctx) { kref_put(&ctx->kref, eventfd_free); } EXPORT_SYMBOL_GPL(eventfd_ctx_put); static int eventfd_release(struct inode *inode, struct file *file) { struct eventfd_ctx *ctx = file->private_data; wake_up_poll(&ctx->wqh, EPOLLHUP); eventfd_ctx_put(ctx); return 0; } static __poll_t eventfd_poll(struct file *file, poll_table *wait) { struct eventfd_ctx *ctx = file->private_data; __poll_t events = 0; u64 count; poll_wait(file, &ctx->wqh, wait); /* * All writes to ctx->count occur within ctx->wqh.lock. This read * can be done outside ctx->wqh.lock because we know that poll_wait * takes that lock (through add_wait_queue) if our caller will sleep. * * The read _can_ therefore seep into add_wait_queue's critical * section, but cannot move above it! add_wait_queue's spin_lock acts * as an acquire barrier and ensures that the read be ordered properly * against the writes. The following CAN happen and is safe: * * poll write * ----------------- ------------ * lock ctx->wqh.lock (in poll_wait) * count = ctx->count * __add_wait_queue * unlock ctx->wqh.lock * lock ctx->qwh.lock * ctx->count += n * if (waitqueue_active) * wake_up_locked_poll * unlock ctx->qwh.lock * eventfd_poll returns 0 * * but the following, which would miss a wakeup, cannot happen: * * poll write * ----------------- ------------ * count = ctx->count (INVALID!) * lock ctx->qwh.lock * ctx->count += n * **waitqueue_active is false** * **no wake_up_locked_poll!** * unlock ctx->qwh.lock * lock ctx->wqh.lock (in poll_wait) * __add_wait_queue * unlock ctx->wqh.lock * eventfd_poll returns 0 */ count = READ_ONCE(ctx->count); if (count > 0) events |= EPOLLIN; if (count == ULLONG_MAX) events |= EPOLLERR; if (ULLONG_MAX - 1 > count) events |= EPOLLOUT; return events; } void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { lockdep_assert_held(&ctx->wqh.lock); *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count; ctx->count -= *cnt; } EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); /** * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. * @ctx: [in] Pointer to eventfd context. * @wait: [in] Wait queue to be removed. * @cnt: [out] Pointer to the 64-bit counter value. * * Returns %0 if successful, or the following error codes: * * -EAGAIN : The operation would have blocked. 
* * This is used to atomically remove a wait queue entry from the eventfd wait * queue head, and read/reset the counter value. */ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt) { unsigned long flags; spin_lock_irqsave(&ctx->wqh.lock, flags); eventfd_ctx_do_read(ctx, cnt); __remove_wait_queue(&ctx->wqh, wait); if (*cnt != 0 && waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLOUT); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return *cnt != 0 ? 0 : -EAGAIN; } EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct eventfd_ctx *ctx = file->private_data; __u64 ucnt = 0; if (iov_iter_count(to) < sizeof(ucnt)) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); if (!ctx->count) { if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) { spin_unlock_irq(&ctx->wqh.lock); return -EAGAIN; } if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) { spin_unlock_irq(&ctx->wqh.lock); return -ERESTARTSYS; } } eventfd_ctx_do_read(ctx, &ucnt); current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLOUT); current->in_eventfd = 0; spin_unlock_irq(&ctx->wqh.lock); if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt))) return -EFAULT; return sizeof(ucnt); } static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct eventfd_ctx *ctx = file->private_data; ssize_t res; __u64 ucnt; if (count != sizeof(ucnt)) return -EINVAL; if (copy_from_user(&ucnt, buf, sizeof(ucnt))) return -EFAULT; if (ucnt == ULLONG_MAX) return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; if (ULLONG_MAX - ctx->count > ucnt) res = sizeof(ucnt); else if (!(file->f_flags & O_NONBLOCK)) { res = wait_event_interruptible_locked_irq(ctx->wqh, ULLONG_MAX - ctx->count > ucnt); if (!res) res = sizeof(ucnt); } if (likely(res > 0)) { ctx->count += ucnt; current->in_eventfd = 1; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); current->in_eventfd = 0; } spin_unlock_irq(&ctx->wqh.lock); return res; } #ifdef CONFIG_PROC_FS static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; __u64 cnt; spin_lock_irq(&ctx->wqh.lock); cnt = ctx->count; spin_unlock_irq(&ctx->wqh.lock); seq_printf(m, "eventfd-count: %16llx\n" "eventfd-id: %d\n" "eventfd-semaphore: %d\n", cnt, ctx->id, !!(ctx->flags & EFD_SEMAPHORE)); } #endif static const struct file_operations eventfd_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = eventfd_show_fdinfo, #endif .release = eventfd_release, .poll = eventfd_poll, .read_iter = eventfd_read, .write = eventfd_write, .llseek = noop_llseek, }; /** * eventfd_fget - Acquire a reference of an eventfd file descriptor. * @fd: [in] Eventfd file descriptor. * * Returns a pointer to the eventfd file structure in case of success, or the * following error pointer: * * -EBADF : Invalid @fd file descriptor. * -EINVAL : The @fd file descriptor is not an eventfd file. */ struct file *eventfd_fget(int fd) { struct file *file; file = fget(fd); if (!file) return ERR_PTR(-EBADF); if (file->f_op != &eventfd_fops) { fput(file); return ERR_PTR(-EINVAL); } return file; } EXPORT_SYMBOL_GPL(eventfd_fget); /** * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context. * @fd: [in] Eventfd file descriptor. 
* * Returns a pointer to the internal eventfd context, otherwise the error * pointers returned by the following functions: * * eventfd_fget */ struct eventfd_ctx *eventfd_ctx_fdget(int fd) { CLASS(fd, f)(fd); if (fd_empty(f)) return ERR_PTR(-EBADF); return eventfd_ctx_fileget(fd_file(f)); } EXPORT_SYMBOL_GPL(eventfd_ctx_fdget); /** * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context. * @file: [in] Eventfd file pointer. * * Returns a pointer to the internal eventfd context, otherwise the error * pointer: * * -EINVAL : The @fd file descriptor is not an eventfd file. */ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) { struct eventfd_ctx *ctx; if (file->f_op != &eventfd_fops) return ERR_PTR(-EINVAL); ctx = file->private_data; kref_get(&ctx->kref); return ctx; } EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); static int do_eventfd(unsigned int count, int flags) { struct eventfd_ctx *ctx; struct file *file; int fd; /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0)); if (flags & ~EFD_FLAGS_SET) return -EINVAL; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; kref_init(&ctx->kref); init_waitqueue_head(&ctx->wqh); ctx->count = count; ctx->flags = flags; ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL); flags &= EFD_SHARED_FCNTL_FLAGS; flags |= O_RDWR; fd = get_unused_fd_flags(flags); if (fd < 0) goto err; file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags); if (IS_ERR(file)) { put_unused_fd(fd); fd = PTR_ERR(file); goto err; } file->f_mode |= FMODE_NOWAIT; fd_install(fd, file); return fd; err: eventfd_free_ctx(ctx); return fd; } SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) { return do_eventfd(count, flags); } SYSCALL_DEFINE1(eventfd, unsigned int, count) { return do_eventfd(count, 0); } |
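/*
 * Editorial example, not part of fs/eventfd.c: a minimal userspace sketch of
 * the counter semantics implemented by eventfd_write() and
 * eventfd_ctx_do_read() above. A plain eventfd accumulates written values and
 * a read returns and clears the whole counter; with EFD_SEMAPHORE each read
 * returns 1 and decrements the counter by 1. Assumes the usual glibc
 * <sys/eventfd.h> wrapper; error handling is trimmed to the bare minimum.
 */
#include <sys/eventfd.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int efd;

	/* Plain counter mode: a read returns and resets the accumulated count. */
	efd = eventfd(0, 0);
	if (efd < 0)
		return 1;
	val = 3;
	write(efd, &val, sizeof(val));		/* kernel: ctx->count += 3 */
	val = 4;
	write(efd, &val, sizeof(val));		/* kernel: ctx->count += 4 */
	read(efd, &val, sizeof(val));		/* returns 7, counter reset to 0 */
	printf("counter mode read: %llu\n", (unsigned long long)val);
	close(efd);

	/* EFD_SEMAPHORE mode: each read returns 1 and decrements by 1. */
	efd = eventfd(2, EFD_SEMAPHORE);
	if (efd < 0)
		return 1;
	read(efd, &val, sizeof(val));		/* returns 1, counter now 1 */
	read(efd, &val, sizeof(val));		/* returns 1, counter now 0 */
	printf("semaphore mode read: %llu\n", (unsigned long long)val);
	close(efd);
	return 0;
}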
// SPDX-License-Identifier: GPL-2.0 // // Register cache access API // // Copyright 2011 Wolfson Microelectronics plc // // Author: Dimitris Papastamos <dp@opensource.wolfsonmicro.com> #include <linux/bsearch.h> #include <linux/device.h> #include
<linux/export.h> #include <linux/slab.h> #include <linux/sort.h> #include "trace.h" #include "internal.h" static const struct regcache_ops *cache_types[] = { ®cache_rbtree_ops, ®cache_maple_ops, ®cache_flat_ops, }; static int regcache_hw_init(struct regmap *map) { int i, j; int ret; int count; unsigned int reg, val; void *tmp_buf; if (!map->num_reg_defaults_raw) return -EINVAL; /* calculate the size of reg_defaults */ for (count = 0, i = 0; i < map->num_reg_defaults_raw; i++) if (regmap_readable(map, i * map->reg_stride) && !regmap_volatile(map, i * map->reg_stride)) count++; /* all registers are unreadable or volatile, so just bypass */ if (!count) { map->cache_bypass = true; return 0; } map->num_reg_defaults = count; map->reg_defaults = kmalloc_array(count, sizeof(struct reg_default), GFP_KERNEL); if (!map->reg_defaults) return -ENOMEM; if (!map->reg_defaults_raw) { bool cache_bypass = map->cache_bypass; dev_warn(map->dev, "No cache defaults, reading back from HW\n"); /* Bypass the cache access till data read from HW */ map->cache_bypass = true; tmp_buf = kmalloc(map->cache_size_raw, GFP_KERNEL); if (!tmp_buf) { ret = -ENOMEM; goto err_free; } ret = regmap_raw_read(map, 0, tmp_buf, map->cache_size_raw); map->cache_bypass = cache_bypass; if (ret == 0) { map->reg_defaults_raw = tmp_buf; map->cache_free = true; } else { kfree(tmp_buf); } } /* fill the reg_defaults */ for (i = 0, j = 0; i < map->num_reg_defaults_raw; i++) { reg = i * map->reg_stride; if (!regmap_readable(map, reg)) continue; if (regmap_volatile(map, reg)) continue; if (map->reg_defaults_raw) { val = regcache_get_val(map, map->reg_defaults_raw, i); } else { bool cache_bypass = map->cache_bypass; map->cache_bypass = true; ret = regmap_read(map, reg, &val); map->cache_bypass = cache_bypass; if (ret != 0) { dev_err(map->dev, "Failed to read %d: %d\n", reg, ret); goto err_free; } } map->reg_defaults[j].reg = reg; map->reg_defaults[j].def = val; j++; } return 0; err_free: kfree(map->reg_defaults); return ret; } int regcache_init(struct regmap *map, const struct regmap_config *config) { int ret; int i; void *tmp_buf; if (map->cache_type == REGCACHE_NONE) { if (config->reg_defaults || config->num_reg_defaults_raw) dev_warn(map->dev, "No cache used with register defaults set!\n"); map->cache_bypass = true; return 0; } if (config->reg_defaults && !config->num_reg_defaults) { dev_err(map->dev, "Register defaults are set without the number!\n"); return -EINVAL; } if (config->num_reg_defaults && !config->reg_defaults) { dev_err(map->dev, "Register defaults number are set without the reg!\n"); return -EINVAL; } for (i = 0; i < config->num_reg_defaults; i++) if (config->reg_defaults[i].reg % map->reg_stride) return -EINVAL; for (i = 0; i < ARRAY_SIZE(cache_types); i++) if (cache_types[i]->type == map->cache_type) break; if (i == ARRAY_SIZE(cache_types)) { dev_err(map->dev, "Could not match cache type: %d\n", map->cache_type); return -EINVAL; } map->num_reg_defaults = config->num_reg_defaults; map->num_reg_defaults_raw = config->num_reg_defaults_raw; map->reg_defaults_raw = config->reg_defaults_raw; map->cache_word_size = BITS_TO_BYTES(config->val_bits); map->cache_size_raw = map->cache_word_size * config->num_reg_defaults_raw; map->cache = NULL; map->cache_ops = cache_types[i]; if (!map->cache_ops->read || !map->cache_ops->write || !map->cache_ops->name) return -EINVAL; /* We still need to ensure that the reg_defaults * won't vanish from under us. We'll need to make * a copy of it. 
*/ if (config->reg_defaults) { tmp_buf = kmemdup_array(config->reg_defaults, map->num_reg_defaults, sizeof(*map->reg_defaults), GFP_KERNEL); if (!tmp_buf) return -ENOMEM; map->reg_defaults = tmp_buf; } else if (map->num_reg_defaults_raw) { /* Some devices such as PMICs don't have cache defaults, * we cope with this by reading back the HW registers and * crafting the cache defaults by hand. */ ret = regcache_hw_init(map); if (ret < 0) return ret; if (map->cache_bypass) return 0; } if (!map->max_register_is_set && map->num_reg_defaults_raw) { map->max_register = (map->num_reg_defaults_raw - 1) * map->reg_stride; map->max_register_is_set = true; } if (map->cache_ops->init) { dev_dbg(map->dev, "Initializing %s cache\n", map->cache_ops->name); map->lock(map->lock_arg); ret = map->cache_ops->init(map); map->unlock(map->lock_arg); if (ret) goto err_free; } return 0; err_free: kfree(map->reg_defaults); if (map->cache_free) kfree(map->reg_defaults_raw); return ret; } void regcache_exit(struct regmap *map) { if (map->cache_type == REGCACHE_NONE) return; BUG_ON(!map->cache_ops); kfree(map->reg_defaults); if (map->cache_free) kfree(map->reg_defaults_raw); if (map->cache_ops->exit) { dev_dbg(map->dev, "Destroying %s cache\n", map->cache_ops->name); map->lock(map->lock_arg); map->cache_ops->exit(map); map->unlock(map->lock_arg); } } /** * regcache_read - Fetch the value of a given register from the cache. * * @map: map to configure. * @reg: The register index. * @value: The value to be returned. * * Return a negative value on failure, 0 on success. */ int regcache_read(struct regmap *map, unsigned int reg, unsigned int *value) { int ret; if (map->cache_type == REGCACHE_NONE) return -EINVAL; BUG_ON(!map->cache_ops); if (!regmap_volatile(map, reg)) { ret = map->cache_ops->read(map, reg, value); if (ret == 0) trace_regmap_reg_read_cache(map, reg, *value); return ret; } return -EINVAL; } /** * regcache_write - Set the value of a given register in the cache. * * @map: map to configure. * @reg: The register index. * @value: The new register value. * * Return a negative value on failure, 0 on success. */ int regcache_write(struct regmap *map, unsigned int reg, unsigned int value) { if (map->cache_type == REGCACHE_NONE) return 0; BUG_ON(!map->cache_ops); if (!regmap_volatile(map, reg)) return map->cache_ops->write(map, reg, value); return 0; } bool regcache_reg_needs_sync(struct regmap *map, unsigned int reg, unsigned int val) { int ret; if (!regmap_writeable(map, reg)) return false; /* If we don't know the chip just got reset, then sync everything. */ if (!map->no_sync_defaults) return true; /* Is this the hardware default? If so skip. */ ret = regcache_lookup_reg(map, reg); if (ret >= 0 && val == map->reg_defaults[ret].def) return false; return true; } static int regcache_default_sync(struct regmap *map, unsigned int min, unsigned int max) { unsigned int reg; for (reg = min; reg <= max; reg += map->reg_stride) { unsigned int val; int ret; if (regmap_volatile(map, reg) || !regmap_writeable(map, reg)) continue; ret = regcache_read(map, reg, &val); if (ret == -ENOENT) continue; if (ret) return ret; if (!regcache_reg_needs_sync(map, reg, val)) continue; map->cache_bypass = true; ret = _regmap_write(map, reg, val); map->cache_bypass = false; if (ret) { dev_err(map->dev, "Unable to sync register %#x. 
%d\n", reg, ret); return ret; } dev_dbg(map->dev, "Synced register %#x, value %#x\n", reg, val); } return 0; } static int rbtree_all(const void *key, const struct rb_node *node) { return 0; } /** * regcache_sync - Sync the register cache with the hardware. * * @map: map to configure. * * Any registers that should not be synced should be marked as * volatile. In general drivers can choose not to use the provided * syncing functionality if they so require. * * Return a negative value on failure, 0 on success. */ int regcache_sync(struct regmap *map) { int ret = 0; unsigned int i; const char *name; bool bypass; struct rb_node *node; if (WARN_ON(map->cache_type == REGCACHE_NONE)) return -EINVAL; BUG_ON(!map->cache_ops); map->lock(map->lock_arg); /* Remember the initial bypass state */ bypass = map->cache_bypass; dev_dbg(map->dev, "Syncing %s cache\n", map->cache_ops->name); name = map->cache_ops->name; trace_regcache_sync(map, name, "start"); if (!map->cache_dirty) goto out; /* Apply any patch first */ map->cache_bypass = true; for (i = 0; i < map->patch_regs; i++) { ret = _regmap_write(map, map->patch[i].reg, map->patch[i].def); if (ret != 0) { dev_err(map->dev, "Failed to write %x = %x: %d\n", map->patch[i].reg, map->patch[i].def, ret); goto out; } } map->cache_bypass = false; if (map->cache_ops->sync) ret = map->cache_ops->sync(map, 0, map->max_register); else ret = regcache_default_sync(map, 0, map->max_register); if (ret == 0) map->cache_dirty = false; out: /* Restore the bypass state */ map->cache_bypass = bypass; map->no_sync_defaults = false; /* * If we did any paging with cache bypassed and a cached * paging register then the register and cache state might * have gone out of sync, force writes of all the paging * registers. */ rb_for_each(node, NULL, &map->range_tree, rbtree_all) { struct regmap_range_node *this = rb_entry(node, struct regmap_range_node, node); /* If there's nothing in the cache there's nothing to sync */ if (regcache_read(map, this->selector_reg, &i) != 0) continue; ret = _regmap_write(map, this->selector_reg, i); if (ret != 0) { dev_err(map->dev, "Failed to write %x = %x: %d\n", this->selector_reg, i, ret); break; } } map->unlock(map->lock_arg); regmap_async_complete(map); trace_regcache_sync(map, name, "stop"); return ret; } EXPORT_SYMBOL_GPL(regcache_sync); /** * regcache_sync_region - Sync part of the register cache with the hardware. * * @map: map to sync. * @min: first register to sync * @max: last register to sync * * Write all non-default register values in the specified region to * the hardware. * * Return a negative value on failure, 0 on success. 
*/ int regcache_sync_region(struct regmap *map, unsigned int min, unsigned int max) { int ret = 0; const char *name; bool bypass; if (WARN_ON(map->cache_type == REGCACHE_NONE)) return -EINVAL; BUG_ON(!map->cache_ops); map->lock(map->lock_arg); /* Remember the initial bypass state */ bypass = map->cache_bypass; name = map->cache_ops->name; dev_dbg(map->dev, "Syncing %s cache from %d-%d\n", name, min, max); trace_regcache_sync(map, name, "start region"); if (!map->cache_dirty) goto out; map->async = true; if (map->cache_ops->sync) ret = map->cache_ops->sync(map, min, max); else ret = regcache_default_sync(map, min, max); out: /* Restore the bypass state */ map->cache_bypass = bypass; map->async = false; map->no_sync_defaults = false; map->unlock(map->lock_arg); regmap_async_complete(map); trace_regcache_sync(map, name, "stop region"); return ret; } EXPORT_SYMBOL_GPL(regcache_sync_region); /** * regcache_drop_region - Discard part of the register cache * * @map: map to operate on * @min: first register to discard * @max: last register to discard * * Discard part of the register cache. * * Return a negative value on failure, 0 on success. */ int regcache_drop_region(struct regmap *map, unsigned int min, unsigned int max) { int ret = 0; if (!map->cache_ops || !map->cache_ops->drop) return -EINVAL; map->lock(map->lock_arg); trace_regcache_drop_region(map, min, max); ret = map->cache_ops->drop(map, min, max); map->unlock(map->lock_arg); return ret; } EXPORT_SYMBOL_GPL(regcache_drop_region); /** * regcache_cache_only - Put a register map into cache only mode * * @map: map to configure * @enable: flag if changes should be written to the hardware * * When a register map is marked as cache only writes to the register * map API will only update the register cache, they will not cause * any hardware changes. This is useful for allowing portions of * drivers to act as though the device were functioning as normal when * it is disabled for power saving reasons. */ void regcache_cache_only(struct regmap *map, bool enable) { map->lock(map->lock_arg); WARN_ON(map->cache_type != REGCACHE_NONE && map->cache_bypass && enable); map->cache_only = enable; trace_regmap_cache_only(map, enable); map->unlock(map->lock_arg); } EXPORT_SYMBOL_GPL(regcache_cache_only); /** * regcache_mark_dirty - Indicate that HW registers were reset to default values * * @map: map to mark * * Inform regcache that the device has been powered down or reset, so that * on resume, regcache_sync() knows to write out all non-default values * stored in the cache. * * If this function is not called, regcache_sync() will assume that * the hardware state still matches the cache state, modulo any writes that * happened when cache_only was true. */ void regcache_mark_dirty(struct regmap *map) { map->lock(map->lock_arg); map->cache_dirty = true; map->no_sync_defaults = true; map->unlock(map->lock_arg); } EXPORT_SYMBOL_GPL(regcache_mark_dirty); /** * regcache_cache_bypass - Put a register map into cache bypass mode * * @map: map to configure * @enable: flag if changes should not be written to the cache * * When a register map is marked with the cache bypass option, writes * to the register map API will only update the hardware and not * the cache directly. This is useful when syncing the cache back to * the hardware. 
*/ void regcache_cache_bypass(struct regmap *map, bool enable) { map->lock(map->lock_arg); WARN_ON(map->cache_only && enable); map->cache_bypass = enable; trace_regmap_cache_bypass(map, enable); map->unlock(map->lock_arg); } EXPORT_SYMBOL_GPL(regcache_cache_bypass); /** * regcache_reg_cached - Check if a register is cached * * @map: map to check * @reg: register to check * * Reports if a register is cached. */ bool regcache_reg_cached(struct regmap *map, unsigned int reg) { unsigned int val; int ret; map->lock(map->lock_arg); ret = regcache_read(map, reg, &val); map->unlock(map->lock_arg); return ret == 0; } EXPORT_SYMBOL_GPL(regcache_reg_cached); void regcache_set_val(struct regmap *map, void *base, unsigned int idx, unsigned int val) { /* Use device native format if possible */ if (map->format.format_val) { map->format.format_val(base + (map->cache_word_size * idx), val, 0); return; } switch (map->cache_word_size) { case 1: { u8 *cache = base; cache[idx] = val; break; } case 2: { u16 *cache = base; cache[idx] = val; break; } case 4: { u32 *cache = base; cache[idx] = val; break; } default: BUG(); } } unsigned int regcache_get_val(struct regmap *map, const void *base, unsigned int idx) { if (!base) return -EINVAL; /* Use device native format if possible */ if (map->format.parse_val) return map->format.parse_val(regcache_get_val_addr(map, base, idx)); switch (map->cache_word_size) { case 1: { const u8 *cache = base; return cache[idx]; } case 2: { const u16 *cache = base; return cache[idx]; } case 4: { const u32 *cache = base; return cache[idx]; } default: BUG(); } /* unreachable */ return -1; } static int regcache_default_cmp(const void *a, const void *b) { const struct reg_default *_a = a; const struct reg_default *_b = b; return _a->reg - _b->reg; } int regcache_lookup_reg(struct regmap *map, unsigned int reg) { struct reg_default key; struct reg_default *r; key.reg = reg; key.def = 0; r = bsearch(&key, map->reg_defaults, map->num_reg_defaults, sizeof(struct reg_default), regcache_default_cmp); if (r) return r - map->reg_defaults; else return -ENOENT; } static bool regcache_reg_present(unsigned long *cache_present, unsigned int idx) { if (!cache_present) return true; return test_bit(idx, cache_present); } int regcache_sync_val(struct regmap *map, unsigned int reg, unsigned int val) { int ret; if (!regcache_reg_needs_sync(map, reg, val)) return 0; map->cache_bypass = true; ret = _regmap_write(map, reg, val); map->cache_bypass = false; if (ret != 0) { dev_err(map->dev, "Unable to sync register %#x. 
%d\n", reg, ret); return ret; } dev_dbg(map->dev, "Synced register %#x, value %#x\n", reg, val); return 0; } static int regcache_sync_block_single(struct regmap *map, void *block, unsigned long *cache_present, unsigned int block_base, unsigned int start, unsigned int end) { unsigned int i, regtmp, val; int ret; for (i = start; i < end; i++) { regtmp = block_base + (i * map->reg_stride); if (!regcache_reg_present(cache_present, i) || !regmap_writeable(map, regtmp)) continue; val = regcache_get_val(map, block, i); ret = regcache_sync_val(map, regtmp, val); if (ret != 0) return ret; } return 0; } static int regcache_sync_block_raw_flush(struct regmap *map, const void **data, unsigned int base, unsigned int cur) { size_t val_bytes = map->format.val_bytes; int ret, count; if (*data == NULL) return 0; count = (cur - base) / map->reg_stride; dev_dbg(map->dev, "Writing %zu bytes for %d registers from 0x%x-0x%x\n", count * val_bytes, count, base, cur - map->reg_stride); map->cache_bypass = true; ret = _regmap_raw_write(map, base, *data, count * val_bytes, false); if (ret) dev_err(map->dev, "Unable to sync registers %#x-%#x. %d\n", base, cur - map->reg_stride, ret); map->cache_bypass = false; *data = NULL; return ret; } static int regcache_sync_block_raw(struct regmap *map, void *block, unsigned long *cache_present, unsigned int block_base, unsigned int start, unsigned int end) { unsigned int i, val; unsigned int regtmp = 0; unsigned int base = 0; const void *data = NULL; int ret; for (i = start; i < end; i++) { regtmp = block_base + (i * map->reg_stride); if (!regcache_reg_present(cache_present, i) || !regmap_writeable(map, regtmp)) { ret = regcache_sync_block_raw_flush(map, &data, base, regtmp); if (ret != 0) return ret; continue; } val = regcache_get_val(map, block, i); if (!regcache_reg_needs_sync(map, regtmp, val)) { ret = regcache_sync_block_raw_flush(map, &data, base, regtmp); if (ret != 0) return ret; continue; } if (!data) { data = regcache_get_val_addr(map, block, i); base = regtmp; } } return regcache_sync_block_raw_flush(map, &data, base, regtmp + map->reg_stride); } int regcache_sync_block(struct regmap *map, void *block, unsigned long *cache_present, unsigned int block_base, unsigned int start, unsigned int end) { if (regmap_can_raw_write(map) && !map->use_single_write) return regcache_sync_block_raw(map, block, cache_present, block_base, start, end); else return regcache_sync_block_single(map, block, cache_present, block_base, start, end); } |
// SPDX-License-Identifier: GPL-2.0-or-later /***************************************************************************** * Linux PPP over L2TP (PPPoX/PPPoL2TP) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoL2TP --- PPP over L2TP (RFC 2661) * * Version: 2.0.0 * * Authors: James Chapman (jchapman@katalix.com) * * Based on original work by Martijn van Oosterhout <kleptog@svana.org> * * License: */ /* This driver handles only L2TP data frames; control frames are handled by a * userspace application. * * To send data in an L2TP session, userspace opens a PPPoL2TP socket and * attaches it to a bound UDP socket with local tunnel_id / session_id and * peer tunnel_id / session_id set. Data can then be sent or received using * regular socket sendmsg() / recvmsg() calls. Kernel parameters of the socket * can be read or modified using ioctl() or [gs]etsockopt() calls. * * When a PPPoL2TP socket is connected with local and peer session_id values * zero, the socket is treated as a special tunnel management socket. * * Here's example userspace code to create a socket for sending/receiving data * over an L2TP session:- * * struct sockaddr_pppol2tp sax; * int fd; * int session_fd; * * fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP); * * sax.sa_family = AF_PPPOX; * sax.sa_protocol = PX_PROTO_OL2TP; * sax.pppol2tp.fd = tunnel_fd; // bound UDP socket * sax.pppol2tp.addr.sin_addr.s_addr = addr->sin_addr.s_addr; * sax.pppol2tp.addr.sin_port = addr->sin_port; * sax.pppol2tp.addr.sin_family = AF_INET; * sax.pppol2tp.s_tunnel = tunnel_id; * sax.pppol2tp.s_session = session_id; * sax.pppol2tp.d_tunnel = peer_tunnel_id; * sax.pppol2tp.d_session = peer_session_id; * * session_fd = connect(fd, (struct sockaddr *)&sax, sizeof(sax)); * * A pppd plugin that allows PPP traffic to be carried over L2TP using * this driver is available from the OpenL2TP project at * http://openl2tp.sourceforge.net.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/string.h> #include <linux/list.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/spinlock.h> #include <linux/kthread.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/if_pppox.h> #include <linux/if_pppol2tp.h> #include <net/sock.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/file.h> #include <linux/hash.h> #include <linux/sort.h> #include <linux/proc_fs.h> #include <linux/l2tp.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/ip.h> #include <net/udp.h> #include <net/inet_common.h> #include <asm/byteorder.h> #include <linux/atomic.h> #include "l2tp_core.h" #define PPPOL2TP_DRV_VERSION "V2.0" /* Space for UDP, L2TP and PPP headers */ #define PPPOL2TP_HEADER_OVERHEAD 40 /* Number of bytes to build transmit L2TP headers. * Unfortunately the size is different depending on whether sequence numbers * are enabled. */ #define PPPOL2TP_L2TP_HDR_SIZE_SEQ 10 #define PPPOL2TP_L2TP_HDR_SIZE_NOSEQ 6 /* Private data of each session. This data lives at the end of struct * l2tp_session, referenced via session->priv[]. */ struct pppol2tp_session { int owner; /* pid that opened the socket */ struct mutex sk_lock; /* Protects .sk */ struct sock __rcu *sk; /* Pointer to the session PPPoX socket */ struct sock *__sk; /* Copy of .sk, for cleanup */ }; static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb); static const struct ppp_channel_ops pppol2tp_chan_ops = { .start_xmit = pppol2tp_xmit, }; static const struct proto_ops pppol2tp_ops; /* Retrieves the pppol2tp socket associated to a session. * A reference is held on the returned socket, so this function must be paired * with sock_put(). */ static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session) { struct pppol2tp_session *ps = l2tp_session_priv(session); struct sock *sk; rcu_read_lock(); sk = rcu_dereference(ps->sk); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } /* Helpers to obtain tunnel/session contexts from sockets. */ static struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk) { struct l2tp_session *session; if (!sk) return NULL; rcu_read_lock(); session = rcu_dereference_sk_user_data(sk); if (session && refcount_inc_not_zero(&session->ref_count)) { rcu_read_unlock(); WARN_ON_ONCE(session->magic != L2TP_SESSION_MAGIC); return session; } rcu_read_unlock(); return NULL; } /***************************************************************************** * Receive data handling *****************************************************************************/ /* Receive message. This is the recvmsg for the PPPoL2TP socket. 
*/ static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { int err; struct sk_buff *skb; struct sock *sk = sock->sk; err = -EIO; if (sk->sk_state & PPPOX_BOUND) goto end; err = 0; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto end; if (len > skb->len) len = skb->len; else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; err = skb_copy_datagram_msg(skb, 0, msg, len); if (likely(err == 0)) err = len; kfree_skb(skb); end: return err; } static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) { struct pppol2tp_session *ps = l2tp_session_priv(session); struct sock *sk = NULL; /* If the socket is bound, send it in to PPP's input queue. Otherwise * queue it on the session socket. */ rcu_read_lock(); sk = rcu_dereference(ps->sk); if (!sk) goto no_sock; /* If the first two bytes are 0xFF03, consider that it is the PPP's * Address and Control fields and skip them. The L2TP module has always * worked this way, although, in theory, the use of these fields should * be negotiated and handled at the PPP layer. These fields are * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered * Information command with Poll/Final bit set to zero (RFC 1662). */ if (pskb_may_pull(skb, 2) && skb->data[0] == PPP_ALLSTATIONS && skb->data[1] == PPP_UI) skb_pull(skb, 2); if (sk->sk_state & PPPOX_BOUND) { struct pppox_sock *po; po = pppox_sk(sk); ppp_input(&po->chan, skb); } else { if (sock_queue_rcv_skb(sk, skb) < 0) { atomic_long_inc(&session->stats.rx_errors); kfree_skb(skb); } } rcu_read_unlock(); return; no_sock: rcu_read_unlock(); pr_warn_ratelimited("%s: no socket in recv\n", session->name); kfree_skb(skb); } /************************************************************************ * Transmit handling ***********************************************************************/ /* This is the sendmsg for the PPPoL2TP pppol2tp_session socket. We come here * when a user application does a sendmsg() on the session socket. L2TP and * PPP headers must be inserted into the user's data. */ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct sock *sk = sock->sk; struct sk_buff *skb; int error; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int uhlen; error = -ENOTCONN; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto error; /* Get session and tunnel contexts */ error = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto error; tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; /* Allocate a socket buffer */ error = -ENOMEM; skb = sock_wmalloc(sk, NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len + 2 + total_len, /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ 0, GFP_KERNEL); if (!skb) goto error_put_sess; /* Reserve space for headers. */ skb_reserve(skb, NET_SKB_PAD); skb_reset_network_header(skb); skb_reserve(skb, sizeof(struct iphdr)); skb_reset_transport_header(skb); skb_reserve(skb, uhlen); /* Add PPP header */ skb->data[0] = PPP_ALLSTATIONS; skb->data[1] = PPP_UI; skb_put(skb, 2); /* Copy user data into skb */ error = memcpy_from_msg(skb_put(skb, total_len), m, total_len); if (error < 0) { kfree_skb(skb); goto error_put_sess; } local_bh_disable(); l2tp_xmit_skb(session, skb); local_bh_enable(); l2tp_session_put(session); return total_len; error_put_sess: l2tp_session_put(session); error: return error; } /* Transmit function called by generic PPP driver. 
Sends PPP frame * over PPPoL2TP socket. * * This is almost the same as pppol2tp_sendmsg(), but rather than * being called with a msghdr from userspace, it is called with a skb * from the kernel. * * The supplied skb from ppp doesn't have enough headroom for the * insertion of L2TP, UDP and IP headers so we need to allocate more * headroom in the skb. This will create a cloned skb. But we must be * careful in the error case because the caller will expect to free * the skb it supplied, not our cloned skb. So we take care to always * leave the original skb unfreed if we return an error. */ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = (struct sock *)chan->private; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int uhlen, headroom; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto abort; /* Get session and tunnel contexts from the socket */ session = pppol2tp_sock_to_session(sk); if (!session) goto abort; tunnel = session->tunnel; uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0; headroom = NET_SKB_PAD + sizeof(struct iphdr) + /* IP header */ uhlen + /* UDP header (if L2TP_ENCAPTYPE_UDP) */ session->hdr_len + /* L2TP header */ 2; /* 2 bytes for PPP_ALLSTATIONS & PPP_UI */ if (skb_cow_head(skb, headroom)) goto abort_put_sess; /* Setup PPP header */ __skb_push(skb, 2); skb->data[0] = PPP_ALLSTATIONS; skb->data[1] = PPP_UI; local_bh_disable(); l2tp_xmit_skb(session, skb); local_bh_enable(); l2tp_session_put(session); return 1; abort_put_sess: l2tp_session_put(session); abort: /* Free the original skb */ kfree_skb(skb); return 1; } /***************************************************************************** * Session (and tunnel control) socket create/destroy. *****************************************************************************/ /* Really kill the session socket. (Called from sock_put() if * refcnt == 0.) */ static void pppol2tp_session_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static void pppol2tp_session_close(struct l2tp_session *session) { struct pppol2tp_session *ps; ps = l2tp_session_priv(session); mutex_lock(&ps->sk_lock); ps->__sk = rcu_dereference_protected(ps->sk, lockdep_is_held(&ps->sk_lock)); RCU_INIT_POINTER(ps->sk, NULL); mutex_unlock(&ps->sk_lock); if (ps->__sk) { /* detach socket */ rcu_assign_sk_user_data(ps->__sk, NULL); sock_put(ps->__sk); /* drop ref taken when we referenced socket via sk_user_data */ l2tp_session_put(session); } } /* Called when the PPPoX socket (session) is closed. */ static int pppol2tp_release(struct socket *sock) { struct sock *sk = sock->sk; struct l2tp_session *session; int error; if (!sk) return 0; error = -EBADF; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) != 0) goto error; pppox_unbind_sock(sk); /* Signal the death of the socket. */ sk->sk_state = PPPOX_DEAD; sock_orphan(sk); sock->sk = NULL; session = pppol2tp_sock_to_session(sk); if (session) { l2tp_session_delete(session); /* drop ref taken by pppol2tp_sock_to_session */ l2tp_session_put(session); } release_sock(sk); sock_put(sk); return 0; error: release_sock(sk); return error; } static struct proto pppol2tp_sk_proto = { .name = "PPPOL2TP", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; static int pppol2tp_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; rc = l2tp_udp_encap_recv(sk, skb); if (rc) kfree_skb(skb); return NET_RX_SUCCESS; } /* socket() handler. Initialize a new struct sock. 
*/ static int pppol2tp_create(struct net *net, struct socket *sock, int kern) { int error = -ENOMEM; struct sock *sk; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto, kern); if (!sk) goto out; sock_init_data(sock, sk); sock_set_flag(sk, SOCK_RCU_FREE); sock->state = SS_UNCONNECTED; sock->ops = &pppol2tp_ops; sk->sk_backlog_rcv = pppol2tp_backlog_recv; sk->sk_protocol = PX_PROTO_OL2TP; sk->sk_family = PF_PPPOX; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_destruct = pppol2tp_session_destruct; error = 0; out: return error; } static void pppol2tp_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; struct sock *sk; sk = pppol2tp_session_get_sock(session); if (sk) { struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); sock_put(sk); } } static void pppol2tp_session_init(struct l2tp_session *session) { struct pppol2tp_session *ps; session->recv_skb = pppol2tp_recv; session->session_close = pppol2tp_session_close; if (IS_ENABLED(CONFIG_L2TP_DEBUGFS)) session->show = pppol2tp_show; ps = l2tp_session_priv(session); mutex_init(&ps->sk_lock); ps->owner = current->pid; } struct l2tp_connect_info { u8 version; int fd; u32 tunnel_id; u32 peer_tunnel_id; u32 session_id; u32 peer_session_id; }; static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len, struct l2tp_connect_info *info) { switch (sa_len) { case sizeof(struct sockaddr_pppol2tp): { const struct sockaddr_pppol2tp *sa_v2in4 = sa; if (sa_v2in4->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 2; info->fd = sa_v2in4->pppol2tp.fd; info->tunnel_id = sa_v2in4->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v2in4->pppol2tp.d_tunnel; info->session_id = sa_v2in4->pppol2tp.s_session; info->peer_session_id = sa_v2in4->pppol2tp.d_session; break; } case sizeof(struct sockaddr_pppol2tpv3): { const struct sockaddr_pppol2tpv3 *sa_v3in4 = sa; if (sa_v3in4->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 3; info->fd = sa_v3in4->pppol2tp.fd; info->tunnel_id = sa_v3in4->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v3in4->pppol2tp.d_tunnel; info->session_id = sa_v3in4->pppol2tp.s_session; info->peer_session_id = sa_v3in4->pppol2tp.d_session; break; } case sizeof(struct sockaddr_pppol2tpin6): { const struct sockaddr_pppol2tpin6 *sa_v2in6 = sa; if (sa_v2in6->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 2; info->fd = sa_v2in6->pppol2tp.fd; info->tunnel_id = sa_v2in6->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v2in6->pppol2tp.d_tunnel; info->session_id = sa_v2in6->pppol2tp.s_session; info->peer_session_id = sa_v2in6->pppol2tp.d_session; break; } case sizeof(struct sockaddr_pppol2tpv3in6): { const struct sockaddr_pppol2tpv3in6 *sa_v3in6 = sa; if (sa_v3in6->sa_protocol != PX_PROTO_OL2TP) return -EINVAL; info->version = 3; info->fd = sa_v3in6->pppol2tp.fd; info->tunnel_id = sa_v3in6->pppol2tp.s_tunnel; info->peer_tunnel_id = sa_v3in6->pppol2tp.d_tunnel; info->session_id = sa_v3in6->pppol2tp.s_session; info->peer_session_id = sa_v3in6->pppol2tp.d_session; break; } default: return -EINVAL; } return 0; } /* Rough estimation of the maximum payload size a tunnel can transmit without * fragmenting at the lower IP layer. Assumes L2TPv2 with sequence * numbers and no IP option. Not quite accurate, but the result is mostly * unused anyway. 
*/ static int pppol2tp_tunnel_mtu(const struct l2tp_tunnel *tunnel) { int mtu; mtu = l2tp_tunnel_dst_mtu(tunnel); if (mtu <= PPPOL2TP_HEADER_OVERHEAD) return 1500 - PPPOL2TP_HEADER_OVERHEAD; return mtu - PPPOL2TP_HEADER_OVERHEAD; } static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net, const struct l2tp_connect_info *info, bool *new_tunnel) { struct l2tp_tunnel *tunnel; int error; *new_tunnel = false; tunnel = l2tp_tunnel_get(net, info->tunnel_id); /* Special case: create tunnel context if session_id and * peer_session_id is 0. Otherwise look up tunnel using supplied * tunnel id. */ if (!info->session_id && !info->peer_session_id) { if (!tunnel) { struct l2tp_tunnel_cfg tcfg = { .encap = L2TP_ENCAPTYPE_UDP, }; /* Prevent l2tp_tunnel_register() from trying to set up * a kernel socket. */ if (info->fd < 0) return ERR_PTR(-EBADF); error = l2tp_tunnel_create(info->fd, info->version, info->tunnel_id, info->peer_tunnel_id, &tcfg, &tunnel); if (error < 0) return ERR_PTR(error); refcount_inc(&tunnel->ref_count); error = l2tp_tunnel_register(tunnel, net, &tcfg); if (error < 0) { kfree(tunnel); return ERR_PTR(error); } *new_tunnel = true; } } else { /* Error if we can't find the tunnel */ if (!tunnel) return ERR_PTR(-ENOENT); /* Error if socket is not prepped */ if (!tunnel->sock) { l2tp_tunnel_put(tunnel); return ERR_PTR(-ENOENT); } } return tunnel; } /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); struct l2tp_session *session = NULL; struct l2tp_connect_info info; struct l2tp_tunnel *tunnel; struct pppol2tp_session *ps; struct l2tp_session_cfg cfg = { 0, }; bool drop_refcnt = false; bool new_session = false; bool new_tunnel = false; int error; error = pppol2tp_sockaddr_get_info(uservaddr, sockaddr_len, &info); if (error < 0) return error; /* Don't bind if tunnel_id is 0 */ if (!info.tunnel_id) return -EINVAL; tunnel = pppol2tp_tunnel_get(sock_net(sk), &info, &new_tunnel); if (IS_ERR(tunnel)) return PTR_ERR(tunnel); lock_sock(sk); /* Check for already bound sockets */ error = -EBUSY; if (sk->sk_state & PPPOX_CONNECTED) goto end; /* We don't supporting rebinding anyway */ error = -EALREADY; if (sk->sk_user_data) goto end; /* socket is already attached */ if (tunnel->peer_tunnel_id == 0) tunnel->peer_tunnel_id = info.peer_tunnel_id; session = l2tp_session_get(sock_net(sk), tunnel->sock, tunnel->version, info.tunnel_id, info.session_id); if (session) { drop_refcnt = true; if (session->pwtype != L2TP_PWTYPE_PPP) { error = -EPROTOTYPE; goto end; } ps = l2tp_session_priv(session); /* Using a pre-existing session is fine as long as it hasn't * been connected yet. 
*/ mutex_lock(&ps->sk_lock); if (rcu_dereference_protected(ps->sk, lockdep_is_held(&ps->sk_lock)) || ps->__sk) { mutex_unlock(&ps->sk_lock); error = -EEXIST; goto end; } } else { cfg.pw_type = L2TP_PWTYPE_PPP; session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, info.session_id, info.peer_session_id, &cfg); if (IS_ERR(session)) { error = PTR_ERR(session); goto end; } drop_refcnt = true; pppol2tp_session_init(session); ps = l2tp_session_priv(session); refcount_inc(&session->ref_count); mutex_lock(&ps->sk_lock); error = l2tp_session_register(session, tunnel); if (error < 0) { mutex_unlock(&ps->sk_lock); l2tp_session_put(session); goto end; } new_session = true; } /* Special case: if source & dest session_id == 0x0000, this * socket is being created to manage the tunnel. Just set up * the internal context for use by ioctl() and sockopt() * handlers. */ if (session->session_id == 0 && session->peer_session_id == 0) { error = 0; goto out_no_ppp; } /* The only header we need to worry about is the L2TP * header. This size is different depending on whether * sequence numbers are enabled for the data channel. */ po->chan.hdrlen = PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; po->chan.private = sk; po->chan.ops = &pppol2tp_chan_ops; po->chan.mtu = pppol2tp_tunnel_mtu(tunnel); error = ppp_register_net_channel(sock_net(sk), &po->chan); if (error) { mutex_unlock(&ps->sk_lock); goto end; } out_no_ppp: /* This is how we get the session context from the socket. */ sock_hold(sk); rcu_assign_sk_user_data(sk, session); rcu_assign_pointer(ps->sk, sk); mutex_unlock(&ps->sk_lock); /* Keep the reference we've grabbed on the session: sk doesn't expect * the session to disappear. pppol2tp_session_close() is responsible * for dropping it. */ drop_refcnt = false; sk->sk_state = PPPOX_CONNECTED; end: if (error) { if (new_session) l2tp_session_delete(session); if (new_tunnel) l2tp_tunnel_delete(tunnel); } if (drop_refcnt) l2tp_session_put(session); l2tp_tunnel_put(tunnel); release_sock(sk); return error; } #ifdef CONFIG_L2TP_V3 /* Called when creating sessions via the netlink interface. */ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { int error; struct l2tp_session *session; /* Error if tunnel socket is not prepped */ if (!tunnel->sock) { error = -ENOENT; goto err; } /* Allocate and initialize a new session context. */ session = l2tp_session_create(sizeof(struct pppol2tp_session), tunnel, session_id, peer_session_id, cfg); if (IS_ERR(session)) { error = PTR_ERR(session); goto err; } pppol2tp_session_init(session); error = l2tp_session_register(session, tunnel); if (error < 0) goto err_sess; return 0; err_sess: l2tp_session_put(session); err: return error; } #endif /* CONFIG_L2TP_V3 */ /* getname() support. 
*/ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = 0; int error = 0; struct l2tp_session *session; struct l2tp_tunnel *tunnel; struct sock *sk = sock->sk; struct inet_sock *inet; struct pppol2tp_session *pls; error = -ENOTCONN; if (!sk) goto end; if (!(sk->sk_state & PPPOX_CONNECTED)) goto end; error = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto end; pls = l2tp_session_priv(session); tunnel = session->tunnel; inet = inet_sk(tunnel->sock); if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET) { struct sockaddr_pppol2tp sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin_family = AF_INET; sp.pppol2tp.addr.sin_port = inet->inet_dport; sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; memcpy(uaddr, &sp, len); #if IS_ENABLED(CONFIG_IPV6) } else if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpin6 sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin6_family = AF_INET6; sp.pppol2tp.addr.sin6_port = inet->inet_dport; memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr, sizeof(tunnel->sock->sk_v6_daddr)); memcpy(uaddr, &sp, len); } else if (tunnel->version == 3 && tunnel->sock->sk_family == AF_INET6) { struct sockaddr_pppol2tpv3in6 sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin6_family = AF_INET6; sp.pppol2tp.addr.sin6_port = inet->inet_dport; memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr, sizeof(tunnel->sock->sk_v6_daddr)); memcpy(uaddr, &sp, len); #endif } else if (tunnel->version == 3) { struct sockaddr_pppol2tpv3 sp; len = sizeof(sp); memset(&sp, 0, len); sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OL2TP; sp.pppol2tp.fd = tunnel->fd; sp.pppol2tp.pid = pls->owner; sp.pppol2tp.s_tunnel = tunnel->tunnel_id; sp.pppol2tp.d_tunnel = tunnel->peer_tunnel_id; sp.pppol2tp.s_session = session->session_id; sp.pppol2tp.d_session = session->peer_session_id; sp.pppol2tp.addr.sin_family = AF_INET; sp.pppol2tp.addr.sin_port = inet->inet_dport; sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr; memcpy(uaddr, &sp, len); } error = len; l2tp_session_put(session); end: return error; } /**************************************************************************** * ioctl() handlers. * * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP * sockets. However, in order to control kernel tunnel features, we allow * userspace to create a special "tunnel" PPPoX socket which is used for * control only. 
Tunnel PPPoX sockets have session_id == 0 and simply allow * the user application to issue L2TP setsockopt(), getsockopt() and ioctl() * calls. ****************************************************************************/ static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest, const struct l2tp_stats *stats) { memset(dest, 0, sizeof(*dest)); dest->tx_packets = atomic_long_read(&stats->tx_packets); dest->tx_bytes = atomic_long_read(&stats->tx_bytes); dest->tx_errors = atomic_long_read(&stats->tx_errors); dest->rx_packets = atomic_long_read(&stats->rx_packets); dest->rx_bytes = atomic_long_read(&stats->rx_bytes); dest->rx_seq_discards = atomic_long_read(&stats->rx_seq_discards); dest->rx_oos_packets = atomic_long_read(&stats->rx_oos_packets); dest->rx_errors = atomic_long_read(&stats->rx_errors); } static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats, struct l2tp_tunnel *tunnel) { struct l2tp_session *session; if (!stats->session_id) { pppol2tp_copy_stats(stats, &tunnel->stats); return 0; } /* If session_id is set, search the corresponding session in the * context of this tunnel and record the session's statistics. */ session = l2tp_session_get(tunnel->l2tp_net, tunnel->sock, tunnel->version, tunnel->tunnel_id, stats->session_id); if (!session) return -EBADR; if (session->pwtype != L2TP_PWTYPE_PPP) { l2tp_session_put(session); return -EBADR; } pppol2tp_copy_stats(stats, &session->stats); l2tp_session_put(session); return 0; } static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct pppol2tp_ioc_stats stats; struct l2tp_session *session; switch (cmd) { case PPPIOCGMRU: case PPPIOCGFLAGS: session = sock->sk->sk_user_data; if (!session) return -ENOTCONN; if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) return -EBADF; /* Not defined for tunnels */ if (!session->session_id && !session->peer_session_id) return -ENOSYS; if (put_user(0, (int __user *)arg)) return -EFAULT; break; case PPPIOCSMRU: case PPPIOCSFLAGS: session = sock->sk->sk_user_data; if (!session) return -ENOTCONN; if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) return -EBADF; /* Not defined for tunnels */ if (!session->session_id && !session->peer_session_id) return -ENOSYS; if (!access_ok((int __user *)arg, sizeof(int))) return -EFAULT; break; case PPPIOCGL2TPSTATS: session = sock->sk->sk_user_data; if (!session) return -ENOTCONN; if (WARN_ON(session->magic != L2TP_SESSION_MAGIC)) return -EBADF; /* Session 0 represents the parent tunnel */ if (!session->session_id && !session->peer_session_id) { u32 session_id; int err; if (copy_from_user(&stats, (void __user *)arg, sizeof(stats))) return -EFAULT; session_id = stats.session_id; err = pppol2tp_tunnel_copy_stats(&stats, session->tunnel); if (err < 0) return err; stats.session_id = session_id; } else { pppol2tp_copy_stats(&stats, &session->stats); stats.session_id = session->session_id; } stats.tunnel_id = session->tunnel->tunnel_id; stats.using_ipsec = l2tp_tunnel_uses_xfrm(session->tunnel); if (copy_to_user((void __user *)arg, &stats, sizeof(stats))) return -EFAULT; break; default: return -ENOIOCTLCMD; } return 0; } /***************************************************************************** * setsockopt() / getsockopt() support. * * The PPPoX socket is created for L2TP sessions: tunnels have their own UDP * sockets. In order to control kernel tunnel features, we allow userspace to * create a special "tunnel" PPPoX socket which is used for control only. 
* Tunnel PPPoX sockets have session_id == 0 and simply allow the user * application to issue L2TP setsockopt(), getsockopt() and ioctl() calls. *****************************************************************************/ /* Tunnel setsockopt() helper. */ static int pppol2tp_tunnel_setsockopt(struct sock *sk, struct l2tp_tunnel *tunnel, int optname, int val) { int err = 0; switch (optname) { case PPPOL2TP_SO_DEBUG: /* Tunnel debug flags option is deprecated */ break; default: err = -ENOPROTOOPT; break; } return err; } /* Session setsockopt helper. */ static int pppol2tp_session_setsockopt(struct sock *sk, struct l2tp_session *session, int optname, int val) { int err = 0; switch (optname) { case PPPOL2TP_SO_RECVSEQ: if (val != 0 && val != 1) { err = -EINVAL; break; } session->recv_seq = !!val; break; case PPPOL2TP_SO_SENDSEQ: if (val != 0 && val != 1) { err = -EINVAL; break; } session->send_seq = !!val; { struct pppox_sock *po = pppox_sk(sk); po->chan.hdrlen = val ? PPPOL2TP_L2TP_HDR_SIZE_SEQ : PPPOL2TP_L2TP_HDR_SIZE_NOSEQ; } l2tp_session_set_header_len(session, session->tunnel->version, session->tunnel->encap); break; case PPPOL2TP_SO_LNSMODE: if (val != 0 && val != 1) { err = -EINVAL; break; } session->lns_mode = !!val; break; case PPPOL2TP_SO_DEBUG: /* Session debug flags option is deprecated */ break; case PPPOL2TP_SO_REORDERTO: session->reorder_timeout = msecs_to_jiffies(val); break; default: err = -ENOPROTOOPT; break; } return err; } /* Main setsockopt() entry point. * Does API checks, then calls either the tunnel or session setsockopt * handler, according to whether the PPPoL2TP socket is a for a regular * session or the special tunnel type. */ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int val; int err; if (level != SOL_PPPOL2TP) return -EINVAL; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(int))) return -EFAULT; err = -ENOTCONN; if (!sk->sk_user_data) goto end; /* Get session context from the socket */ err = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val); } else { err = pppol2tp_session_setsockopt(sk, session, optname, val); } l2tp_session_put(session); end: return err; } /* Tunnel getsockopt helper. Called with sock locked. */ static int pppol2tp_tunnel_getsockopt(struct sock *sk, struct l2tp_tunnel *tunnel, int optname, int *val) { int err = 0; switch (optname) { case PPPOL2TP_SO_DEBUG: /* Tunnel debug flags option is deprecated */ *val = 0; break; default: err = -ENOPROTOOPT; break; } return err; } /* Session getsockopt helper. Called with sock locked. */ static int pppol2tp_session_getsockopt(struct sock *sk, struct l2tp_session *session, int optname, int *val) { int err = 0; switch (optname) { case PPPOL2TP_SO_RECVSEQ: *val = session->recv_seq; break; case PPPOL2TP_SO_SENDSEQ: *val = session->send_seq; break; case PPPOL2TP_SO_LNSMODE: *val = session->lns_mode; break; case PPPOL2TP_SO_DEBUG: /* Session debug flags option is deprecated */ *val = 0; break; case PPPOL2TP_SO_REORDERTO: *val = (int)jiffies_to_msecs(session->reorder_timeout); break; default: err = -ENOPROTOOPT; } return err; } /* Main getsockopt() entry point. 
* Does API checks, then calls either the tunnel or session getsockopt * handler, according to whether the PPPoX socket is a for a regular session * or the special tunnel type. */ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct l2tp_session *session; struct l2tp_tunnel *tunnel; int val, len; int err; if (level != SOL_PPPOL2TP) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); err = -ENOTCONN; if (!sk->sk_user_data) goto end; /* Get the session context */ err = -EBADF; session = pppol2tp_sock_to_session(sk); if (!session) goto end; /* Special case: if session_id == 0x0000, treat as operation on tunnel */ if (session->session_id == 0 && session->peer_session_id == 0) { tunnel = session->tunnel; err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val); if (err) goto end_put_sess; } else { err = pppol2tp_session_getsockopt(sk, session, optname, &val); if (err) goto end_put_sess; } err = -EFAULT; if (put_user(len, optlen)) goto end_put_sess; if (copy_to_user((void __user *)optval, &val, len)) goto end_put_sess; err = 0; end_put_sess: l2tp_session_put(session); end: return err; } /***************************************************************************** * /proc filesystem for debug * Since the original pppol2tp driver provided /proc/net/pppol2tp for * L2TPv2, we dump only L2TPv2 tunnels and sessions here. *****************************************************************************/ #ifdef CONFIG_PROC_FS struct pppol2tp_seq_data { struct seq_net_private p; unsigned long tkey; /* lookup key of current tunnel */ unsigned long skey; /* lookup key of current session */ struct l2tp_tunnel *tunnel; struct l2tp_session *session; /* NULL means get next tunnel */ }; static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->tunnel) l2tp_tunnel_put(pd->tunnel); for (;;) { pd->tunnel = l2tp_tunnel_get_next(net, &pd->tkey); pd->tkey++; /* Only accept L2TPv2 tunnels */ if (!pd->tunnel || pd->tunnel->version == 2) return; l2tp_tunnel_put(pd->tunnel); } } static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd) { /* Drop reference taken during previous invocation */ if (pd->session) l2tp_session_put(pd->session); pd->session = l2tp_session_get_next(net, pd->tunnel->sock, pd->tunnel->version, pd->tunnel->tunnel_id, &pd->skey); pd->skey++; if (!pd->session) { pd->skey = 0; pppol2tp_next_tunnel(net, pd); } } static void *pppol2tp_seq_start(struct seq_file *m, loff_t *offs) { struct pppol2tp_seq_data *pd = SEQ_START_TOKEN; loff_t pos = *offs; struct net *net; if (!pos) goto out; if (WARN_ON(!m->private)) { pd = NULL; goto out; } pd = m->private; net = seq_file_net(m); if (!pd->tunnel) pppol2tp_next_tunnel(net, pd); else pppol2tp_next_session(net, pd); /* NULL tunnel and session indicates end of list */ if (!pd->tunnel && !pd->session) pd = NULL; out: return pd; } static void *pppol2tp_seq_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return NULL; } static void pppol2tp_seq_stop(struct seq_file *p, void *v) { struct pppol2tp_seq_data *pd = v; if (!pd || pd == SEQ_START_TOKEN) return; /* Drop reference taken by last invocation of pppol2tp_next_session() * or pppol2tp_next_tunnel(). 
*/ if (pd->session) { l2tp_session_put(pd->session); pd->session = NULL; } if (pd->tunnel) { l2tp_tunnel_put(pd->tunnel); pd->tunnel = NULL; } } static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v) { struct l2tp_tunnel *tunnel = v; seq_printf(m, "\nTUNNEL '%s', %c %d\n", tunnel->name, tunnel->sock ? 'Y' : 'N', refcount_read(&tunnel->ref_count) - 1); seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n", 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), atomic_long_read(&tunnel->stats.tx_errors), atomic_long_read(&tunnel->stats.rx_packets), atomic_long_read(&tunnel->stats.rx_bytes), atomic_long_read(&tunnel->stats.rx_errors)); } static void pppol2tp_seq_session_show(struct seq_file *m, void *v) { struct l2tp_session *session = v; struct l2tp_tunnel *tunnel = session->tunnel; unsigned char state; char user_data_ok; struct sock *sk; u32 ip = 0; u16 port = 0; if (tunnel->sock) { struct inet_sock *inet = inet_sk(tunnel->sock); ip = ntohl(inet->inet_saddr); port = ntohs(inet->inet_sport); } sk = pppol2tp_session_get_sock(session); if (sk) { state = sk->sk_state; user_data_ok = (session == sk->sk_user_data) ? 'Y' : 'N'; } else { state = 0; user_data_ok = 'N'; } seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> %04X/%04X %d %c\n", session->name, ip, port, tunnel->tunnel_id, session->session_id, tunnel->peer_tunnel_id, session->peer_session_id, state, user_data_ok); seq_printf(m, " 0/0/%c/%c/%s %08x %u\n", session->recv_seq ? 'R' : '-', session->send_seq ? 'S' : '-', session->lns_mode ? "LNS" : "LAC", 0, jiffies_to_msecs(session->reorder_timeout)); seq_printf(m, " %u/%u %ld/%ld/%ld %ld/%ld/%ld\n", session->nr, session->ns, atomic_long_read(&session->stats.tx_packets), atomic_long_read(&session->stats.tx_bytes), atomic_long_read(&session->stats.tx_errors), atomic_long_read(&session->stats.rx_packets), atomic_long_read(&session->stats.rx_bytes), atomic_long_read(&session->stats.rx_errors)); if (sk) { struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); sock_put(sk); } } static int pppol2tp_seq_show(struct seq_file *m, void *v) { struct pppol2tp_seq_data *pd = v; /* display header on line 1 */ if (v == SEQ_START_TOKEN) { seq_puts(m, "PPPoL2TP driver info, " PPPOL2TP_DRV_VERSION "\n"); seq_puts(m, "TUNNEL name, user-data-ok session-count\n"); seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); seq_puts(m, " SESSION name, addr/port src-tid/sid dest-tid/sid state user-data-ok\n"); seq_puts(m, " mtu/mru/rcvseq/sendseq/lns debug reorderto\n"); seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n"); goto out; } if (!pd->session) pppol2tp_seq_tunnel_show(m, pd->tunnel); else pppol2tp_seq_session_show(m, pd->session); out: return 0; } static const struct seq_operations pppol2tp_seq_ops = { .start = pppol2tp_seq_start, .next = pppol2tp_seq_next, .stop = pppol2tp_seq_stop, .show = pppol2tp_seq_show, }; #endif /* CONFIG_PROC_FS */ /***************************************************************************** * Network namespace *****************************************************************************/ static __net_init int pppol2tp_init_net(struct net *net) { struct proc_dir_entry *pde; int err = 0; pde = proc_create_net("pppol2tp", 0444, net->proc_net, &pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data)); if (!pde) { err = -ENOMEM; goto out; } out: return err; } static __net_exit void pppol2tp_exit_net(struct net *net) { remove_proc_entry("pppol2tp", net->proc_net); } static struct pernet_operations 
pppol2tp_net_ops = { .init = pppol2tp_init_net, .exit = pppol2tp_exit_net, }; /***************************************************************************** * Init and cleanup *****************************************************************************/ static const struct proto_ops pppol2tp_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pppol2tp_release, .bind = sock_no_bind, .connect = pppol2tp_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppol2tp_getname, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = pppol2tp_setsockopt, .getsockopt = pppol2tp_getsockopt, .sendmsg = pppol2tp_sendmsg, .recvmsg = pppol2tp_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppol2tp_proto = { .create = pppol2tp_create, .ioctl = pppol2tp_ioctl, .owner = THIS_MODULE, }; #ifdef CONFIG_L2TP_V3 static const struct l2tp_nl_cmd_ops pppol2tp_nl_cmd_ops = { .session_create = pppol2tp_session_create, .session_delete = l2tp_session_delete, }; #endif /* CONFIG_L2TP_V3 */ static int __init pppol2tp_init(void) { int err; err = register_pernet_device(&pppol2tp_net_ops); if (err) goto out; err = proto_register(&pppol2tp_sk_proto, 0); if (err) goto out_unregister_pppol2tp_pernet; err = register_pppox_proto(PX_PROTO_OL2TP, &pppol2tp_proto); if (err) goto out_unregister_pppol2tp_proto; #ifdef CONFIG_L2TP_V3 err = l2tp_nl_register_ops(L2TP_PWTYPE_PPP, &pppol2tp_nl_cmd_ops); if (err) goto out_unregister_pppox; #endif pr_info("PPPoL2TP kernel driver, %s\n", PPPOL2TP_DRV_VERSION); out: return err; #ifdef CONFIG_L2TP_V3 out_unregister_pppox: unregister_pppox_proto(PX_PROTO_OL2TP); #endif out_unregister_pppol2tp_proto: proto_unregister(&pppol2tp_sk_proto); out_unregister_pppol2tp_pernet: unregister_pernet_device(&pppol2tp_net_ops); goto out; } static void __exit pppol2tp_exit(void) { #ifdef CONFIG_L2TP_V3 l2tp_nl_unregister_ops(L2TP_PWTYPE_PPP); #endif unregister_pppox_proto(PX_PROTO_OL2TP); proto_unregister(&pppol2tp_sk_proto); unregister_pernet_device(&pppol2tp_net_ops); } module_init(pppol2tp_init); module_exit(pppol2tp_exit); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("PPP over L2TP over UDP"); MODULE_LICENSE("GPL"); MODULE_VERSION(PPPOL2TP_DRV_VERSION); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP); MODULE_ALIAS_L2TP_PWTYPE(7); |
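/*
 * Illustrative userspace sketch (not part of the driver source above): one
 * plausible way the management-only "tunnel" PPPoX socket described in the
 * ioctl/getsockopt comments might be used to read whole-tunnel statistics.
 * The helper names (open_l2tp_tunnel_mgmt_socket, print_tunnel_stats), the
 * SOCK_DGRAM type and the pre-existing connected UDP tunnel socket are
 * assumptions for the example; the L2TP control-plane exchange normally
 * handled by a userspace daemon is omitted, as is most error handling.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/if_pppox.h>
#include <linux/if_pppol2tp.h>
#include <linux/ppp-ioctl.h>

static int open_l2tp_tunnel_mgmt_socket(int udp_fd, int local_tid, int peer_tid)
{
	struct sockaddr_pppol2tp sax;
	int fd;

	fd = socket(AF_PPPOX, SOCK_DGRAM, PX_PROTO_OL2TP);
	if (fd < 0)
		return -1;

	memset(&sax, 0, sizeof(sax));
	sax.sa_family = AF_PPPOX;
	sax.sa_protocol = PX_PROTO_OL2TP;
	sax.pppol2tp.fd = udp_fd;	/* the tunnel's UDP socket */
	sax.pppol2tp.s_tunnel = local_tid;
	sax.pppol2tp.d_tunnel = peer_tid;
	sax.pppol2tp.s_session = 0;	/* session_id == 0 ...            */
	sax.pppol2tp.d_session = 0;	/* ... marks a tunnel-only socket */

	if (connect(fd, (struct sockaddr *)&sax, sizeof(sax)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}

static void print_tunnel_stats(int tunnel_fd)
{
	struct pppol2tp_ioc_stats stats;

	memset(&stats, 0, sizeof(stats));
	stats.session_id = 0;		/* 0 requests whole-tunnel counters */
	if (ioctl(tunnel_fd, PPPIOCGL2TPSTATS, &stats) == 0)
		printf("tx %llu pkts, rx %llu pkts\n",
		       (unsigned long long)stats.tx_packets,
		       (unsigned long long)stats.rx_packets);
}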
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_TEXT_PATCHING_H
#define _ASM_X86_TEXT_PATCHING_H
#include <linux/types.h>
#include <linux/stddef.h>
#include <asm/ptrace.h>

/*
 * Currently, the max observed size in the kernel code is
 * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5.
 * Raise it if needed.
 */
#define POKE_MAX_OPCODE_SIZE	5

extern void text_poke_early(void *addr, const void *opcode, size_t len);

extern void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len);

/*
 * Clear and restore the kernel write-protection flag on the local CPU.
 * Allows the kernel to edit read-only pages.
 * Side-effect: any interrupt handler running between save and restore will have
 * the ability to write to read-only pages.
 *
 * Warning:
 * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and
 * no thread can be preempted in the instructions being modified (no iret to an
 * invalid instruction possible) or if the instructions are changed from a
 * consistent state to another consistent state atomically.
 * On the local CPU you need to be protected against NMI or MCE handlers seeing
 * an inconsistent instruction while you patch.
*/ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); #define text_poke_copy text_poke_copy extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate); extern void text_poke_finish(void); #define INT3_INSN_SIZE 1 #define INT3_INSN_OPCODE 0xCC #define RET_INSN_SIZE 1 #define RET_INSN_OPCODE 0xC3 #define CALL_INSN_SIZE 5 #define CALL_INSN_OPCODE 0xE8 #define JMP32_INSN_SIZE 5 #define JMP32_INSN_OPCODE 0xE9 #define JMP8_INSN_SIZE 2 #define JMP8_INSN_OPCODE 0xEB #define DISP32_SIZE 4 static __always_inline int text_opcode_size(u8 opcode) { int size = 0; #define __CASE(insn) \ case insn##_INSN_OPCODE: size = insn##_INSN_SIZE; break switch(opcode) { __CASE(INT3); __CASE(RET); __CASE(CALL); __CASE(JMP32); __CASE(JMP8); } #undef __CASE return size; } union text_poke_insn { u8 text[POKE_MAX_OPCODE_SIZE]; struct { u8 opcode; s32 disp; } __attribute__((packed)); }; static __always_inline void __text_gen_insn(void *buf, u8 opcode, const void *addr, const void *dest, int size) { union text_poke_insn *insn = buf; BUG_ON(size < text_opcode_size(opcode)); /* * Hide the addresses to avoid the compiler folding in constants when * referencing code, these can mess up annotations like * ANNOTATE_NOENDBR. */ OPTIMIZER_HIDE_VAR(insn); OPTIMIZER_HIDE_VAR(addr); OPTIMIZER_HIDE_VAR(dest); insn->opcode = opcode; if (size > 1) { insn->disp = (long)dest - (long)(addr + size); if (size == 2) { /* * Ensure that for JMP8 the displacement * actually fits the signed byte. */ BUG_ON((insn->disp >> 31) != (insn->disp >> 7)); } } } static __always_inline void *text_gen_insn(u8 opcode, const void *addr, const void *dest) { static union text_poke_insn insn; /* per instance */ __text_gen_insn(&insn, opcode, addr, dest, text_opcode_size(opcode)); return &insn.text; } extern int after_bootmem; extern __ro_after_init struct mm_struct *poking_mm; extern __ro_after_init unsigned long poking_addr; #ifndef CONFIG_UML_X86 static __always_inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip) { regs->ip = ip; } static __always_inline void int3_emulate_push(struct pt_regs *regs, unsigned long val) { /* * The int3 handler in entry_64.S adds a gap between the * stack where the break point happened, and the saving of * pt_regs. We can extend the original stack because of * this gap. See the idtentry macro's create_gap option. * * Similarly entry_32.S will have a gap on the stack for (any) hardware * exception and pt_regs; see FIXUP_FRAME. 
*/ regs->sp -= sizeof(unsigned long); *(unsigned long *)regs->sp = val; } static __always_inline unsigned long int3_emulate_pop(struct pt_regs *regs) { unsigned long val = *(unsigned long *)regs->sp; regs->sp += sizeof(unsigned long); return val; } static __always_inline void int3_emulate_call(struct pt_regs *regs, unsigned long func) { int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE); int3_emulate_jmp(regs, func); } static __always_inline void int3_emulate_ret(struct pt_regs *regs) { unsigned long ip = int3_emulate_pop(regs); int3_emulate_jmp(regs, ip); } static __always_inline void int3_emulate_jcc(struct pt_regs *regs, u8 cc, unsigned long ip, unsigned long disp) { static const unsigned long jcc_mask[6] = { [0] = X86_EFLAGS_OF, [1] = X86_EFLAGS_CF, [2] = X86_EFLAGS_ZF, [3] = X86_EFLAGS_CF | X86_EFLAGS_ZF, [4] = X86_EFLAGS_SF, [5] = X86_EFLAGS_PF, }; bool invert = cc & 1; bool match; if (cc < 0xc) { match = regs->flags & jcc_mask[cc >> 1]; } else { match = ((regs->flags & X86_EFLAGS_SF) >> X86_EFLAGS_SF_BIT) ^ ((regs->flags & X86_EFLAGS_OF) >> X86_EFLAGS_OF_BIT); if (cc >= 0xe) match = match || (regs->flags & X86_EFLAGS_ZF); } if ((match && !invert) || (!match && invert)) ip += disp; int3_emulate_jmp(regs, ip); } #endif /* !CONFIG_UML_X86 */ #endif /* _ASM_X86_TEXT_PATCHING_H */ |
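/*
 * Illustrative sketch (not part of the header above): one way the helpers
 * declared in <asm/text_patching.h> are typically combined to retarget a
 * 5-byte CALL at runtime, in the spirit of static_call/jump_label patching.
 * The function name patch_call_site() and its parameters are invented for
 * the example; real callers also verify the existing instruction bytes
 * before rewriting them.
 */
#include <linux/memory.h>	/* text_mutex */
#include <linux/mutex.h>
#include <asm/text_patching.h>

static void patch_call_site(void *call_insn_addr, void *new_target)
{
	/*
	 * Build "call new_target" with the displacement encoded relative to
	 * call_insn_addr.  text_gen_insn() fills a static buffer private to
	 * this call site, so the returned pointer is consumed immediately.
	 */
	const void *insn = text_gen_insn(CALL_INSN_OPCODE,
					 call_insn_addr, new_target);

	/*
	 * text_poke_bp() expects text_mutex to be held.  It installs an INT3
	 * first, then the instruction tail, then the first opcode byte, with
	 * the required synchronization in between, so other CPUs never see a
	 * half-written instruction.
	 */
	mutex_lock(&text_mutex);
	text_poke_bp(call_insn_addr, insn, CALL_INSN_SIZE, NULL);
	mutex_unlock(&text_mutex);
}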
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	-  July2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-pm.h>
#include <linux/blk-integrity.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/part_stat.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-cgroup.h"
#include "blk-throttle.h"
#include "blk-ioprio.h"

struct dentry
*blk_debugfs_root; EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); static DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ static struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set * @q: request queue */ void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_set); /** * blk_queue_flag_clear - atomically clear a queue flag * @flag: flag to be cleared * @q: request queue */ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { clear_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_clear); #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), REQ_OP_NAME(WRITE), REQ_OP_NAME(FLUSH), REQ_OP_NAME(DISCARD), REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(ZONE_RESET_ALL), REQ_OP_NAME(ZONE_OPEN), REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; #undef REQ_OP_NAME /** * blk_op_str - Return string XXX in the REQ_OP_XXX. * @op: REQ_OP_XXX. * * Description: Centralize block layer function to convert REQ_OP_XXX into * string format. Useful in the debugging and tracing bio or request. For * invalid REQ_OP_XXX it returns string "UNKNOWN". */ inline const char *blk_op_str(enum req_op op) { const char *op_str = "UNKNOWN"; if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) op_str = blk_op_name[op]; return op_str; } EXPORT_SYMBOL_GPL(blk_op_str); static const struct { int errno; const char *name; } blk_errors[] = { [BLK_STS_OK] = { 0, "" }, [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, [BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, [BLK_STS_OFFLINE] = { -ENODEV, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, /* zone device specific errors */ [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, /* Command duration limit device-side timeout */ [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, [BLK_STS_INVAL] = { -EINVAL, "invalid" }, /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; blk_status_t errno_to_blk_status(int errno) { int i; for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { if (blk_errors[i].errno == errno) return (__force blk_status_t)i; } return BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(errno_to_blk_status); int blk_status_to_errno(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= 
ARRAY_SIZE(blk_errors))) return -EIO; return blk_errors[idx].errno; } EXPORT_SYMBOL_GPL(blk_status_to_errno); const char *blk_status_to_str(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return "<null>"; return blk_errors[idx].name; } EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue * * Description: * The block layer may perform asynchronous callback activity * on a queue, such as calling the unplug function after a timeout. * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); cancel_work_sync(&q->timeout_work); } EXPORT_SYMBOL(blk_sync_queue); /** * blk_set_pm_only - increment pm_only counter * @q: request queue pointer */ void blk_set_pm_only(struct request_queue *q) { atomic_inc(&q->pm_only); } EXPORT_SYMBOL_GPL(blk_set_pm_only); void blk_clear_pm_only(struct request_queue *q) { int pm_only; pm_only = atomic_dec_return(&q->pm_only); WARN_ON_ONCE(pm_only < 0); if (pm_only == 0) wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_clear_pm_only); static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); percpu_ref_exit(&q->q_usage_counter); kmem_cache_free(blk_requestq_cachep, q); } static void blk_free_queue(struct request_queue *q) { blk_free_queue_stats(q->stats); if (queue_is_mq(q)) blk_mq_release(q); ida_free(&blk_queue_ida, q->id); lockdep_unregister_key(&q->io_lock_cls_key); lockdep_unregister_key(&q->q_lock_cls_key); call_rcu(&q->rcu_head, blk_free_queue_rcu); } /** * blk_put_queue - decrement the request_queue refcount * @q: the request_queue structure to decrement the refcount for * * Decrements the refcount of the request_queue and free it when the refcount * reaches 0. */ void blk_put_queue(struct request_queue *q) { if (refcount_dec_and_test(&q->refs)) blk_free_queue(q); } EXPORT_SYMBOL(blk_put_queue); bool blk_queue_start_drain(struct request_queue *q) { /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ bool freeze = __blk_freeze_queue_start(q, current); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); return freeze; } /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PM; while (!blk_try_enter_queue(q, pm)) { if (flags & BLK_MQ_REQ_NOWAIT) return -EAGAIN; /* * read pair of barrier in blk_freeze_queue_start(), we need to * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and * reading .mq_freeze_depth or queue dying flag, otherwise the * following wait may never return if the two reads are * reordered. 
*/ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; } rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->q_lockdep_map, _RET_IP_); return 0; } int __bio_queue_enter(struct request_queue *q, struct bio *bio) { while (!blk_try_enter_queue(q, false)) { struct gendisk *disk = bio->bi_bdev->bd_disk; if (bio->bi_opf & REQ_NOWAIT) { if (test_bit(GD_DEAD, &disk->state)) goto dead; bio_wouldblock_error(bio); return -EAGAIN; } /* * read pair of barrier in blk_freeze_queue_start(), we need to * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and * reading .mq_freeze_depth or queue dying flag, otherwise the * following wait may never return if the two reads are * reordered. */ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(false, q)) || test_bit(GD_DEAD, &disk->state)); if (test_bit(GD_DEAD, &disk->state)) goto dead; } rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_); rwsem_release(&q->io_lockdep_map, _RET_IP_); return 0; dead: bio_io_error(bio); return -ENODEV; } void blk_queue_exit(struct request_queue *q) { percpu_ref_put(&q->q_usage_counter); } static void blk_queue_usage_counter_release(struct percpu_ref *ref) { struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); wake_up_all(&q->mq_freeze_wq); } static void blk_rq_timed_out_timer(struct timer_list *t) { struct request_queue *q = from_timer(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } static void blk_timeout_work(struct work_struct *work) { } struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) { struct request_queue *q; int error; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return ERR_PTR(-ENOMEM); q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) { error = q->id; goto fail_q; } q->stats = blk_alloc_queue_stats(); if (!q->stats) { error = -ENOMEM; goto fail_id; } error = blk_set_default_limits(lim); if (error) goto fail_stats; q->limits = *lim; q->node = node_id; atomic_set(&q->nr_active_requests_shared_tags, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); refcount_set(&q->refs, 1); mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); blkg_init_queue(q); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ error = percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); if (error) goto fail_stats; lockdep_register_key(&q->io_lock_cls_key); lockdep_register_key(&q->q_lock_cls_key); lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)", &q->io_lock_cls_key, 0); lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)", &q->q_lock_cls_key, 0); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; fail_stats: blk_free_queue_stats(q->stats); fail_id: ida_free(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); return ERR_PTR(error); } /** * blk_get_queue - increment the request_queue refcount * @q: the request_queue structure to increment the refcount for * * Increment the refcount of the request_queue kobject. * * Context: Any context. 
*/ bool blk_get_queue(struct request_queue *q) { if (unlikely(blk_queue_dying(q))) return false; refcount_inc(&q->refs); return true; } EXPORT_SYMBOL(blk_get_queue); #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); static int __init setup_fail_make_request(char *str) { return setup_fault_attr(&fail_make_request, str); } __setup("fail_make_request=", setup_fail_make_request); bool should_fail_request(struct block_device *part, unsigned int bytes) { return bdev_test_flag(part, BD_MAKE_IT_FAIL) && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline void bio_check_ro(struct bio *bio) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return; if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED)) return; bdev_set_flag(bio->bi_bdev, BD_RO_WARNED); /* * Use ioctl to set underlying disk of raid/dm to read-only * will trigger this. */ pr_warn("Trying to write to read-only block-device %pg\n", bio->bi_bdev); } } static noinline int should_fail_bio(struct bio *bio) { if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; return 0; } ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); /* * Check whether this bio extends beyond the end of the device or partition. * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ static inline int bio_check_eod(struct bio *bio) { sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", current->comm, bio->bi_bdev, bio->bi_opf, bio->bi_iter.bi_sector, nr_sectors, maxsector); return -EIO; } return 0; } /* * Remap block n of partition p to block n+start(p) of the disk. */ static int blk_partition_remap(struct bio *bio) { struct block_device *p = bio->bi_bdev; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) return -EIO; if (bio_sectors(bio)) { bio->bi_iter.bi_sector += p->bd_start_sect; trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } bio_set_flag(bio, BIO_REMAPPED); return 0; } /* * Check write append to a zoned block device. */ static inline blk_status_t blk_check_zone_append(struct request_queue *q, struct bio *bio) { int nr_sectors = bio_sectors(bio); /* Only applicable to zoned block devices */ if (!bdev_is_zoned(bio->bi_bdev)) return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) return BLK_STS_IOERR; /* * Not allowed to cross zone boundaries. Otherwise, the BIO will be * split and could result in non-contiguous sectors being written in * different zones. */ if (nr_sectors > q->limits.chunk_sectors) return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ if (nr_sectors > q->limits.max_zone_append_sectors) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; return BLK_STS_OK; } static void __submit_bio(struct bio *bio) { /* If plug is not used, add new plug here to cache nsecs time. 
*/ struct blk_plug plug; if (unlikely(!blk_crypto_bio_prep(&bio))) return; blk_start_plug(&plug); if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) { blk_mq_submit_bio(bio); } else if (likely(bio_queue_enter(bio) == 0)) { struct gendisk *disk = bio->bi_bdev->bd_disk; if ((bio->bi_opf & REQ_POLLED) && !(disk->queue->limits.features & BLK_FEAT_POLL)) { bio->bi_status = BLK_STS_NOTSUPP; bio_endio(bio); } else { disk->fops->submit_bio(bio); } blk_queue_exit(disk->queue); } blk_finish_plug(&plug); } /* * The loop in this function may be a bit non-obvious, and so deserves some * explanation: * * - Before entering the loop, bio->bi_next is NULL (as all callers ensure * that), so we have a list with a single bio. * - We pretend that we have just taken it off a longer list, so we assign * bio_list to a pointer to the bio_list_on_stack, thus initialising the * bio_list of new bios to be added. ->submit_bio() may indeed add some more * bios through a recursive call to submit_bio_noacct. If it did, we find a * non-NULL value in bio_list and re-enter the loop from the top. * - In this case we really did just take the bio of the top of the list (no * pretending) and so remove it from bio_list, and call into ->submit_bio() * again. * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio, but that haven't been processed yet. */ static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_list lower, same; /* * Create a fresh bio_list for all subordinate requests. */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); __submit_bio(bio); /* * Sort new bios into those for a lower level and those for the * same level. */ bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) if (q == bdev_get_queue(bio->bi_bdev)) bio_list_add(&same, bio); else bio_list_add(&lower, bio); /* * Now assemble so we handle the lowest level first. */ bio_list_merge(&bio_list_on_stack[0], &lower); bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL; } static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; current->bio_list = bio_list; do { __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; } void submit_bio_noacct_nocheck(struct bio *bio) { blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(bio); /* * Now that enqueuing has been traced, we need to trace * completion as well. */ bio_set_flag(bio, BIO_TRACE_COMPLETION); } /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. 
*/ if (current->bio_list) bio_list_add(¤t->bio_list[0], bio); else if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) __submit_bio_noacct_mq(bio); else __submit_bio_noacct(bio); } static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q, struct bio *bio) { if (bio->bi_iter.bi_size > queue_atomic_write_unit_max_bytes(q)) return BLK_STS_INVAL; if (bio->bi_iter.bi_size % queue_atomic_write_unit_min_bytes(q)) return BLK_STS_INVAL; return BLK_STS_OK; } /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. * * This is a version of submit_bio() that shall only be used for I/O that is * resubmitted to lower level drivers by stacking block drivers. All file * systems and other upper level users of the block layer should use * submit_bio() instead. */ void submit_bio_noacct(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; might_sleep(); /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. */ if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; if (should_fail_bio(bio)) goto end_io; bio_check_ro(bio); if (!bio_flagged(bio, BIO_REMAPPED)) { if (unlikely(bio_check_eod(bio))) goto end_io; if (bdev_is_partition(bdev) && unlikely(blk_partition_remap(bio))) goto end_io; } /* * Filter flush bio's early so that bio based drivers without flush * support don't have to worry about them. */ if (op_is_flush(bio->bi_opf)) { if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; if (!bdev_write_cache(bdev)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; goto end_io; } } } switch (bio_op(bio)) { case REQ_OP_READ: break; case REQ_OP_WRITE: if (bio->bi_opf & REQ_ATOMIC) { status = blk_validate_atomic_write_op_size(q, bio); if (status != BLK_STS_OK) goto end_io; } break; case REQ_OP_FLUSH: /* * REQ_OP_FLUSH can't be submitted through bios, it is only * synthetized in struct request by the flush state machine. */ goto not_supported; case REQ_OP_DISCARD: if (!bdev_max_discard_sectors(bdev)) goto not_supported; break; case REQ_OP_SECURE_ERASE: if (!bdev_max_secure_erase_sectors(bdev)) goto not_supported; break; case REQ_OP_ZONE_APPEND: status = blk_check_zone_append(q, bio); if (status != BLK_STS_OK) goto end_io; break; case REQ_OP_WRITE_ZEROES: if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: case REQ_OP_ZONE_RESET_ALL: if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; case REQ_OP_DRV_IN: case REQ_OP_DRV_OUT: /* * Driver private operations are only used with passthrough * requests. */ fallthrough; default: goto not_supported; } if (blk_throtl_bio(bio)) return; submit_bio_noacct_nocheck(bio); return; not_supported: status = BLK_STS_NOTSUPP; end_io: bio->bi_status = status; bio_endio(bio); } EXPORT_SYMBOL(submit_bio_noacct); static void bio_set_ioprio(struct bio *bio) { /* Nobody set ioprio so far? Initialize it based on task's nice value */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) bio->bi_ioprio = get_current_ioprio(); blkcg_set_ioprio(bio); } /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * * submit_bio() is used to submit I/O requests to block devices. 
It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The * bio will be send to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has * been called. */ void submit_bio(struct bio *bio) { if (bio_op(bio) == REQ_OP_READ) { task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, bio_sectors(bio)); } else if (bio_op(bio) == REQ_OP_WRITE) { count_vm_events(PGPGOUT, bio_sectors(bio)); } bio_set_ioprio(bio); submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** * bio_poll - poll for BIO completions * @bio: bio to poll for * @iob: batches of IO * @flags: BLK_POLL_* flags that control the behavior * * Poll for completions on queue associated with the bio. Returns number of * completed entries found. * * Note: the caller must either be the context that submitted @bio, or * be in a RCU critical section to prevent freeing of @bio. */ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { blk_qc_t cookie = READ_ONCE(bio->bi_cookie); struct block_device *bdev; struct request_queue *q; int ret = 0; bdev = READ_ONCE(bio->bi_bdev); if (!bdev) return 0; q = bdev_get_queue(bdev); if (cookie == BLK_QC_T_NONE) return 0; blk_flush_plug(current->plug, false); /* * We need to be able to enter a frozen queue, similar to how * timeouts also need to do that. If that is blocked, then we can * have pending IO when a queue freeze is started, and then the * wait for the freeze to finish will wait for polled requests to * timeout as the poller is preventer from entering the queue and * completing them. As long as we prevent new IO from being queued, * that should be all that matters. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; if (queue_is_mq(q)) { ret = blk_mq_poll(q, cookie, iob, flags); } else { struct gendisk *disk = q->disk; if ((q->limits.features & BLK_FEAT_POLL) && disk && disk->fops->poll_bio) ret = disk->fops->poll_bio(bio, iob, flags); } blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(bio_poll); /* * Helper to implement file_operations.iopoll. Requires the bio to be stored * in iocb->private, and cleared before freeing the bio. */ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, unsigned int flags) { struct bio *bio; int ret = 0; /* * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can * point to a freshly allocated bio at this point. If that happens * we have a few cases to consider: * * 1) the bio is beeing initialized and bi_bdev is NULL. We can just * simply nothing in this case * 2) the bio points to a not poll enabled device. bio_poll will catch * this and return 0 * 3) the bio points to a poll capable device, including but not * limited to the one that the original bio pointed to. In this * case we will call into the actual poll method and poll for I/O, * even if we don't need to, but it won't cause harm either. * * For cases 2) and 3) above the RCU grace period ensures that bi_bdev * is still allocated. Because partitions hold a reference to the whole * device bdev and thus disk, the disk is also still valid. Grabbing * a reference to the queue in bio_poll() ensures the hctxs and requests * are still valid as well. 
*/ rcu_read_lock(); bio = READ_ONCE(kiocb->private); if (bio) ret = bio_poll(bio, iob, flags); rcu_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(iocb_bio_iopoll); void update_io_ticks(struct block_device *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); if (unlikely(time_after(now, stamp)) && likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && (end || part_in_flight(part))) __part_stat_add(part, io_ticks, now - stamp); if (bdev_is_partition(part)) { part = bdev_whole(part); goto again; } } unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time) { part_stat_lock(); update_io_ticks(bdev, start_time, false); part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); return start_time; } EXPORT_SYMBOL(bdev_start_io_acct); /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for * * Returns the start time that should be passed back to bio_end_io_acct(). */ unsigned long bio_start_io_acct(struct bio *bio) { return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, unsigned int sectors, unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); update_io_ticks(bdev, now, true); part_stat_inc(bdev, ios[sgrp]); part_stat_add(bdev, sectors[sgrp], sectors); part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); } EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev) { bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked * * Description: * Check if underlying low-level drivers of a device are busy. * If the drivers want to export their busy state, they must set own * exporting function using blk_queue_lld_busy() first. * * Basically, this function is used only by request stacking drivers * to stop dispatching requests to underlying devices when underlying * devices are busy. This behavior helps more I/O merging on the queue * of the request stacking driver and prevents I/O throughput regression * on burst I/O load. * * Return: * 0 - Not busy (The request stacking driver should dispatch request) * 1 - Busy (The request stacking driver should stop dispatching request) */ int blk_lld_busy(struct request_queue *q) { if (queue_is_mq(q) && q->mq_ops->busy) return q->mq_ops->busy(q); return 0; } EXPORT_SYMBOL_GPL(blk_lld_busy); int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) { struct task_struct *tsk = current; /* * If this is a nested plug, don't actually assign it. 
*/ if (tsk->plug) return; plug->cur_ktime = 0; rq_list_init(&plug->mq_list); rq_list_init(&plug->cached_rqs); plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); plug->rq_count = 0; plug->multiple_queues = false; plug->has_elevator = false; INIT_LIST_HEAD(&plug->cb_list); /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier */ tsk->plug = plug; } /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized * * Description: * blk_start_plug() indicates to the block layer an intent by the caller * to submit multiple I/O requests in a batch. The block layer may use * this hint to defer submitting I/Os from the caller until blk_finish_plug() * is called. However, the block layer may choose to submit requests * before a call to blk_finish_plug() if the number of queued I/Os * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if * the task schedules (see below). * * Tracking blk_plug inside the task_struct will help with auto-flushing the * pending I/O should the task end up blocking between blk_start_plug() and * blk_finish_plug(). This is important from a performance perspective, but * also ensures that we don't deadlock. For instance, if the task is blocking * for a memory allocation, memory reclaim could end up wanting to free a * page belonging to that request that is currently residing in our private * plug. By flushing the pending I/O when the process goes to sleep, we avoid * this kind of deadlock. */ void blk_start_plug(struct blk_plug *plug) { blk_start_plug_nr_ios(plug, 1); } EXPORT_SYMBOL(blk_start_plug); static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); while (!list_empty(&plug->cb_list)) { list_splice_init(&plug->cb_list, &callbacks); while (!list_empty(&callbacks)) { struct blk_plug_cb *cb = list_first_entry(&callbacks, struct blk_plug_cb, list); list_del(&cb->list); cb->callback(cb, from_schedule); } } } struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size) { struct blk_plug *plug = current->plug; struct blk_plug_cb *cb; if (!plug) return NULL; list_for_each_entry(cb, &plug->cb_list, list) if (cb->callback == unplug && cb->data == data) return cb; /* Not currently on the callback list */ BUG_ON(size < sizeof(*cb)); cb = kzalloc(size, GFP_ATOMIC); if (cb) { cb->data = data; cb->callback = unplug; list_add(&cb->list, &plug->cb_list); } return cb; } EXPORT_SYMBOL(blk_check_plugged); void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); blk_mq_flush_plug_list(plug, from_schedule); /* * Unconditionally flush out cached requests, even if the unplug * event came from schedule. Since we know hold references to the * queue for cached requests, we don't want a blocked task holding * up a queue freeze/quiesce event. */ if (unlikely(!rq_list_empty(&plug->cached_rqs))) blk_mq_free_plug_rqs(plug); plug->cur_ktime = 0; current->flags &= ~PF_BLOCK_TS; } /** * blk_finish_plug - mark the end of a batch of submitted I/O * @plug: The &struct blk_plug passed to blk_start_plug() * * Description: * Indicate that a batch of I/O submissions is complete. This function * must be paired with an initial call to blk_start_plug(). The intent * is to allow the block layer to optimize I/O submission. 
See the * documentation for blk_start_plug() for more information. */ void blk_finish_plug(struct blk_plug *plug) { if (plug == current->plug) { __blk_flush_plug(plug, false); current->plug = NULL; } } EXPORT_SYMBOL(blk_finish_plug); void blk_io_schedule(void) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) io_schedule_timeout(timeout); else io_schedule(); } EXPORT_SYMBOL_GPL(blk_io_schedule); int __init blk_dev_init(void) { BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC); blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; } |
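/*
 * Illustrative sketch (not part of the code above): how a submitter
 * batches several bios under a plug so the block layer can hold and
 * merge them until blk_finish_plug(), as described in the
 * blk_start_plug() kerneldoc.  example_submit_batch() is a made-up
 * name and bio preparation is elided; only the plug usage pattern is
 * shown.
 */
#include <linux/blkdev.h>
#include <linux/bio.h>

static void example_submit_batch(struct bio **bios, int nr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);          /* begin batching; nested plugs are ignored */
        for (i = 0; i < nr; i++)
                submit_bio(bios[i]);    /* queued on the plug, may be merged/deferred */
        blk_finish_plug(&plug);         /* flush whatever is still plugged */
}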
// SPDX-License-Identifier: GPL-2.0-or-later /* * HID driver for Kye/Genius devices not fully compliant with HID standard * * Copyright (c) 2009 Jiri Kosina * Copyright (c) 2009 Tomas Hanak * Copyright (c) 2012 Nikolai Kondrashov * Copyright (c) 2023 David Yang */ #include <linux/unaligned.h> #include <linux/device.h> #include <linux/hid.h> #include <linux/module.h> #include "hid-ids.h" /* Data gathered from Database/VID0458_PID????/Vista/TBoard/default.xml in ioTablet driver * * TODO: * - Add battery and sleep support for EasyPen M406W and MousePen M508WX * - Investigate ScrollZ.MiceFMT buttons of EasyPen M406 */ static const __u8 easypen_m406_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x45, 0x02, /* Usage (AC Rotate), */ 0x09, 0x40, /* Usage (Menu), */
0x0A, 0x2F, 0x02, /* Usage (AC Zoom), */ 0x0A, 0x46, 0x02, /* Usage (AC Resize), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x24, 0x02, /* Usage (AC Back), */ 0x0A, 0x25, 0x02, /* Usage (AC Forward), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x08, /* Report Count (8), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x30, /* Report Count (48), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 easypen_m506_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 easypen_m406w_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x01, 0x02, /* Usage (AC New), */ 0x09, 0x40, /* Usage (Menu), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 easypen_m610x_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x79, 0x02, /* Usage (AC Redo Or Repeat), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 pensketch_m912_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x08, /* Report Count (8), */ 0x05, 0x0C, /* Usage Page (Consumer), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x01, 0x02, /* Usage (AC New), */ 0x0A, 0x2F, 0x02, /* Usage (AC Zoom), */ 0x0A, 0x25, 0x02, /* Usage (AC Forward), */ 0x0A, 0x24, 0x02, /* Usage (AC Back), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x30, /* Report Count (48), */ 0x81, 0x03, /* Input (Constant, Variable), */ 0xC0 /* End Collection */ }; static const __u8 mousepen_m508wx_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), 
*/ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 mousepen_m508x_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x01, 0x02, /* Usage (AC New), */ 0x09, 0x40, /* Usage (Menu), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x02, /* Input (Variable), */ 0x81, 0x01, /* Input (Constant), */ 0x15, 0xFF, /* Logical Minimum (-1), */ 0x95, 0x10, /* Report Count (16), */ 0x81, 0x01, /* Input (Constant), */ 0x0A, 0x35, 0x02, /* Usage (AC Scroll), */ 0x0A, 0x2F, 0x02, /* Usage (AC Zoom), */ 0x0A, 0x38, 0x02, /* Usage (AC Pan), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x03, /* Report Count (3), */ 0x81, 0x06, /* Input (Variable, Relative), */ 0x95, 0x01, /* Report Count (1), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; static const __u8 easypen_m406xe_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x04, /* Report Count (4), */ 0x0A, 0x79, 0x02, /* Usage (AC Redo Or Repeat), */ 0x0A, 0x1A, 0x02, /* Usage (AC Undo), */ 0x0A, 0x2D, 0x02, /* Usage (AC Zoom In), */ 0x0A, 0x2E, 0x02, /* Usage (AC Zoom Out), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x34, /* Report Count (52), */ 0x81, 0x03, /* Input (Constant, Variable), */ 0xC0 /* End Collection */ }; static const __u8 pensketch_t609a_control_rdesc[] = { 0x05, 0x0C, /* Usage Page (Consumer), */ 0x09, 0x01, /* Usage (Consumer Control), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x12, /* Report ID (18), */ 0x0A, 0x6A, 0x02, /* Usage (AC Delete), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x08, /* Report Count (8), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x37, /* Report Count (55), */ 0x81, 0x01, /* Input (Constant), */ 0xC0 /* End Collection */ }; /* Fix indexes in kye_tablet_fixup() if you change this */ static const __u8 kye_tablet_rdesc[] = { 0x06, 0x00, 0xFF, /* Usage Page (FF00h), */ 0x09, 0x01, /* Usage (01h), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x05, /* Report ID (5), */ 0x09, 0x01, /* Usage (01h), */ 0x15, 0x81, /* Logical Minimum (-127), */ 0x25, 0x7F, /* Logical Maximum (127), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x07, /* Report Count (7), */ 0xB1, 0x02, /* Feature (Variable), */ 0xC0, /* End Collection, */ 0x05, 0x0D, /* Usage Page (Digitizer), */ 0x09, 0x01, /* Usage (Digitizer), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x10, /* Report ID (16), */ 0x09, 0x20, /* Usage (Stylus), */ 0xA0, /* Collection (Physical), */ 0x09, 0x42, /* 
Usage (Tip Switch), */ 0x09, 0x44, /* Usage (Barrel Switch), */ 0x09, 0x46, /* Usage (Tablet Pick), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x03, /* Report Count (3), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x01, /* Input (Constant), */ 0x09, 0x32, /* Usage (In Range), */ 0x95, 0x01, /* Report Count (1), */ 0x81, 0x02, /* Input (Variable), */ 0x75, 0x10, /* Report Size (16), */ 0xA4, /* Push, */ 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x30, /* Usage (X), */ 0x27, 0xFF, 0x7F, 0x00, 0x00, /* Logical Maximum (32767), */ 0x34, /* Physical Minimum (0), */ 0x47, 0x00, 0x00, 0x00, 0x00, /* Physical Maximum (0), */ 0x65, 0x11, /* Unit (Centimeter), */ 0x55, 0x00, /* Unit Exponent (0), */ 0x75, 0x10, /* Report Size (16), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x31, /* Usage (Y), */ 0x27, 0xFF, 0x7F, 0x00, 0x00, /* Logical Maximum (32767), */ 0x47, 0x00, 0x00, 0x00, 0x00, /* Physical Maximum (0), */ 0x81, 0x02, /* Input (Variable), */ 0xB4, /* Pop, */ 0x05, 0x0D, /* Usage Page (Digitizer), */ 0x09, 0x30, /* Usage (Tip Pressure), */ 0x27, 0xFF, 0x07, 0x00, 0x00, /* Logical Maximum (2047), */ 0x81, 0x02, /* Input (Variable), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection, */ }; /* Fix indexes in kye_tablet_fixup() if you change this */ static const __u8 kye_tablet_mouse_rdesc[] = { 0x05, 0x01, /* Usage Page (Desktop), */ 0x09, 0x02, /* Usage (Mouse), */ 0xA1, 0x01, /* Collection (Application), */ 0x85, 0x11, /* Report ID (17), */ 0x09, 0x01, /* Usage (Pointer), */ 0xA0, /* Collection (Physical), */ 0x05, 0x09, /* Usage Page (Button), */ 0x19, 0x01, /* Usage Minimum (01h), */ 0x29, 0x03, /* Usage Maximum (03h), */ 0x14, /* Logical Minimum (0), */ 0x25, 0x01, /* Logical Maximum (1), */ 0x75, 0x01, /* Report Size (1), */ 0x95, 0x03, /* Report Count (3), */ 0x81, 0x02, /* Input (Variable), */ 0x95, 0x04, /* Report Count (4), */ 0x81, 0x01, /* Input (Constant), */ 0x05, 0x0D, /* Usage Page (Digitizer), */ 0x09, 0x37, /* Usage (Data Valid), */ 0x95, 0x01, /* Report Count (1), */ 0x81, 0x02, /* Input (Variable), */ 0x05, 0x01, /* Usage Page (Desktop), */ 0xA4, /* Push, */ 0x09, 0x30, /* Usage (X), */ 0x27, 0xFF, 0x7F, 0x00, 0x00, /* Logical Maximum (32767), */ 0x34, /* Physical Minimum (0), */ 0x47, 0x00, 0x00, 0x00, 0x00, /* Physical Maximum (0), */ 0x65, 0x11, /* Unit (Centimeter), */ 0x55, 0x00, /* Unit Exponent (0), */ 0x75, 0x10, /* Report Size (16), */ 0x81, 0x02, /* Input (Variable), */ 0x09, 0x31, /* Usage (Y), */ 0x27, 0xFF, 0x7F, 0x00, 0x00, /* Logical Maximum (32767), */ 0x47, 0x00, 0x00, 0x00, 0x00, /* Physical Maximum (0), */ 0x81, 0x02, /* Input (Variable), */ 0xB4, /* Pop, */ 0x09, 0x38, /* Usage (Wheel), */ 0x15, 0xFF, /* Logical Minimum (-1), */ 0x75, 0x08, /* Report Size (8), */ 0x95, 0x01, /* Report Count (1), */ 0x81, 0x06, /* Input (Variable, Relative), */ 0x81, 0x01, /* Input (Constant), */ 0xC0, /* End Collection, */ 0xC0 /* End Collection */ }; static const struct kye_tablet_info { __u32 product; __s32 x_logical_maximum; __s32 y_logical_maximum; __s32 pressure_logical_maximum; __s32 x_physical_maximum; __s32 y_physical_maximum; __s8 unit_exponent; __s8 unit; bool has_mouse; unsigned int control_rsize; const __u8 *control_rdesc; } kye_tablets_info[] = { {USB_DEVICE_ID_KYE_EASYPEN_M406, /* 0x5005 */ 15360, 10240, 1023, 6, 4, 0, 0x13, false, sizeof(easypen_m406_control_rdesc), easypen_m406_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_M506, /* 0x500F */ 
24576, 20480, 1023, 6, 5, 0, 0x13, false, sizeof(easypen_m506_control_rdesc), easypen_m506_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_I405X, /* 0x5010 */ 14080, 10240, 1023, 55, 40, -1, 0x13, false}, {USB_DEVICE_ID_KYE_MOUSEPEN_I608X, /* 0x5011 */ 20480, 15360, 2047, 8, 6, 0, 0x13, true}, {USB_DEVICE_ID_KYE_EASYPEN_M406W, /* 0x5012 */ 15360, 10240, 1023, 6, 4, 0, 0x13, false, sizeof(easypen_m406w_control_rdesc), easypen_m406w_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_M610X, /* 0x5013 */ 40960, 25600, 1023, 1000, 625, -2, 0x13, false, sizeof(easypen_m610x_control_rdesc), easypen_m610x_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_340, /* 0x5014 */ 10240, 7680, 1023, 4, 3, 0, 0x13, false}, {USB_DEVICE_ID_KYE_PENSKETCH_M912, /* 0x5015 */ 61440, 46080, 2047, 12, 9, 0, 0x13, true, sizeof(pensketch_m912_control_rdesc), pensketch_m912_control_rdesc}, {USB_DEVICE_ID_KYE_MOUSEPEN_M508WX, /* 0x5016 */ 40960, 25600, 2047, 8, 5, 0, 0x13, true, sizeof(mousepen_m508wx_control_rdesc), mousepen_m508wx_control_rdesc}, {USB_DEVICE_ID_KYE_MOUSEPEN_M508X, /* 0x5017 */ 40960, 25600, 2047, 8, 5, 0, 0x13, true, sizeof(mousepen_m508x_control_rdesc), mousepen_m508x_control_rdesc}, {USB_DEVICE_ID_KYE_EASYPEN_M406XE, /* 0x5019 */ 15360, 10240, 1023, 6, 4, 0, 0x13, false, sizeof(easypen_m406xe_control_rdesc), easypen_m406xe_control_rdesc}, {USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2, /* 0x501A */ 40960, 30720, 2047, 8, 6, 0, 0x13, true}, {USB_DEVICE_ID_KYE_PENSKETCH_T609A, /* 0x501B */ 43520, 28160, 1023, 85, 55, -1, 0x13, false, sizeof(pensketch_t609a_control_rdesc), pensketch_t609a_control_rdesc}, {} }; static __u8 *kye_consumer_control_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize, int offset, const char *device_name) { /* * the fixup that need to be done: * - change Usage Maximum in the Consumer Control * (report ID 3) to a reasonable value */ if (*rsize >= offset + 31 && /* Usage Page (Consumer Devices) */ rdesc[offset] == 0x05 && rdesc[offset + 1] == 0x0c && /* Usage (Consumer Control) */ rdesc[offset + 2] == 0x09 && rdesc[offset + 3] == 0x01 && /* Usage Maximum > 12287 */ rdesc[offset + 10] == 0x2a && rdesc[offset + 12] > 0x2f) { hid_info(hdev, "fixing up %s report descriptor\n", device_name); rdesc[offset + 12] = 0x2f; } return rdesc; } /* * Fix tablet descriptor of so-called "DataFormat 2". * * Though we may achieve a usable descriptor from original vendor-defined one, * some problems exist: * - Their Logical Maximum never exceed 32767 (7F FF), though device do report * values greater than that; * - Physical Maximums are arbitrarily filled (always equal to Logical * Maximum); * - Detail for control buttons are not provided (a vendor-defined Usage Page * with fixed content). * * Thus we use a pre-defined parameter table rather than digging it from * original descriptor. * * We may as well write a fallback routine for unrecognized kye tablet, but it's * clear kye are unlikely to produce new models in the foreseeable future, so we * simply enumerate all possible models. 
*/ static __u8 *kye_tablet_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { const struct kye_tablet_info *info; __u8 *newdesc = rdesc; if (*rsize < sizeof(kye_tablet_rdesc)) { hid_warn(hdev, "tablet report size too small, or kye_tablet_rdesc unexpectedly large\n"); return rdesc; } for (info = kye_tablets_info; info->product; info++) { if (hdev->product == info->product) break; } if (!info->product) { hid_err(hdev, "tablet unknown, someone forget to add kye_tablet_info entry?\n"); return rdesc; } memcpy(newdesc, kye_tablet_rdesc, sizeof(kye_tablet_rdesc)); put_unaligned_le32(info->x_logical_maximum, newdesc + 66); put_unaligned_le32(info->x_physical_maximum, newdesc + 72); newdesc[77] = info->unit; newdesc[79] = info->unit_exponent; put_unaligned_le32(info->y_logical_maximum, newdesc + 87); put_unaligned_le32(info->y_physical_maximum, newdesc + 92); put_unaligned_le32(info->pressure_logical_maximum, newdesc + 104); newdesc += sizeof(kye_tablet_rdesc); if (info->has_mouse) { if (newdesc + sizeof(kye_tablet_mouse_rdesc) > rdesc + *rsize) hid_err(hdev, "control desc unexpectedly large\n"); else { memcpy(newdesc, kye_tablet_mouse_rdesc, sizeof(kye_tablet_mouse_rdesc)); put_unaligned_le32(info->x_logical_maximum, newdesc + 44); put_unaligned_le32(info->x_physical_maximum, newdesc + 50); newdesc[55] = info->unit; newdesc[57] = info->unit_exponent; put_unaligned_le32(info->y_logical_maximum, newdesc + 65); put_unaligned_le32(info->y_physical_maximum, newdesc + 70); newdesc += sizeof(kye_tablet_mouse_rdesc); } } if (info->control_rsize) { if (newdesc + info->control_rsize > rdesc + *rsize) hid_err(hdev, "control desc unexpectedly large\n"); else { memcpy(newdesc, info->control_rdesc, info->control_rsize); newdesc += info->control_rsize; } } *rsize = newdesc - rdesc; return rdesc; } static const __u8 *kye_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { switch (hdev->product) { case USB_DEVICE_ID_KYE_ERGO_525V: /* the fixups that need to be done: * - change led usage page to button for extra buttons * - report size 8 count 1 must be size 1 count 8 for button * bitfield * - change the button usage range to 4-7 for the extra * buttons */ if (*rsize >= 75 && rdesc[61] == 0x05 && rdesc[62] == 0x08 && rdesc[63] == 0x19 && rdesc[64] == 0x08 && rdesc[65] == 0x29 && rdesc[66] == 0x0f && rdesc[71] == 0x75 && rdesc[72] == 0x08 && rdesc[73] == 0x95 && rdesc[74] == 0x01) { hid_info(hdev, "fixing up Kye/Genius Ergo Mouse " "report descriptor\n"); rdesc[62] = 0x09; rdesc[64] = 0x04; rdesc[66] = 0x07; rdesc[72] = 0x01; rdesc[74] = 0x08; } break; case USB_DEVICE_ID_GENIUS_GILA_GAMING_MOUSE: rdesc = kye_consumer_control_fixup(hdev, rdesc, rsize, 104, "Genius Gila Gaming Mouse"); break; case USB_DEVICE_ID_GENIUS_MANTICORE: rdesc = kye_consumer_control_fixup(hdev, rdesc, rsize, 104, "Genius Manticore Keyboard"); break; case USB_DEVICE_ID_GENIUS_GX_IMPERATOR: rdesc = kye_consumer_control_fixup(hdev, rdesc, rsize, 83, "Genius Gx Imperator Keyboard"); break; case USB_DEVICE_ID_KYE_EASYPEN_M406: case USB_DEVICE_ID_KYE_EASYPEN_M506: case USB_DEVICE_ID_KYE_EASYPEN_I405X: case USB_DEVICE_ID_KYE_MOUSEPEN_I608X: case USB_DEVICE_ID_KYE_EASYPEN_M406W: case USB_DEVICE_ID_KYE_EASYPEN_M610X: case USB_DEVICE_ID_KYE_EASYPEN_340: case USB_DEVICE_ID_KYE_PENSKETCH_M912: case USB_DEVICE_ID_KYE_MOUSEPEN_M508WX: case USB_DEVICE_ID_KYE_MOUSEPEN_M508X: case USB_DEVICE_ID_KYE_EASYPEN_M406XE: case USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2: case USB_DEVICE_ID_KYE_PENSKETCH_T609A: rdesc = 
kye_tablet_fixup(hdev, rdesc, rsize); break; } return rdesc; } static int kye_tablet_enable(struct hid_device *hdev) { struct list_head *list; struct list_head *head; struct hid_report *report; __s32 *value; list = &hdev->report_enum[HID_FEATURE_REPORT].report_list; list_for_each(head, list) { report = list_entry(head, struct hid_report, list); if (report->id == 5) break; } if (head == list) { hid_err(hdev, "tablet-enabling feature report not found\n"); return -ENODEV; } if (report->maxfield < 1 || report->field[0]->report_count < 7) { hid_err(hdev, "invalid tablet-enabling feature report\n"); return -ENODEV; } value = report->field[0]->value; /* * The code is for DataFormat 2 of config xml. They have no obvious * meaning (at least not configurable in Windows driver) except enabling * fully-functional tablet mode (absolute positioning). Otherwise, the * tablet acts like a relative mouse. * * Though there're magic codes for DataFormat 3 and 4, no devices use * these DataFormats. */ value[0] = 0x12; value[1] = 0x10; value[2] = 0x11; value[3] = 0x12; value[4] = 0x00; value[5] = 0x00; value[6] = 0x00; hid_hw_request(hdev, report, HID_REQ_SET_REPORT); return 0; } static int kye_probe(struct hid_device *hdev, const struct hid_device_id *id) { int ret; ret = hid_parse(hdev); if (ret) { hid_err(hdev, "parse failed\n"); goto err; } ret = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (ret) { hid_err(hdev, "hw start failed\n"); goto err; } switch (id->product) { case USB_DEVICE_ID_GENIUS_MANTICORE: /* * The manticore keyboard needs to have all the interfaces * opened at least once to be fully functional. */ if (hid_hw_open(hdev)) hid_hw_close(hdev); break; case USB_DEVICE_ID_KYE_EASYPEN_M406: case USB_DEVICE_ID_KYE_EASYPEN_M506: case USB_DEVICE_ID_KYE_EASYPEN_I405X: case USB_DEVICE_ID_KYE_MOUSEPEN_I608X: case USB_DEVICE_ID_KYE_EASYPEN_M406W: case USB_DEVICE_ID_KYE_EASYPEN_M610X: case USB_DEVICE_ID_KYE_EASYPEN_340: case USB_DEVICE_ID_KYE_PENSKETCH_M912: case USB_DEVICE_ID_KYE_MOUSEPEN_M508WX: case USB_DEVICE_ID_KYE_MOUSEPEN_M508X: case USB_DEVICE_ID_KYE_EASYPEN_M406XE: case USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2: case USB_DEVICE_ID_KYE_PENSKETCH_T609A: ret = kye_tablet_enable(hdev); if (ret) { hid_err(hdev, "tablet enabling failed\n"); goto enabling_err; } break; } return 0; enabling_err: hid_hw_stop(hdev); err: return ret; } static const struct hid_device_id kye_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_ERGO_525V) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_GENIUS_GILA_GAMING_MOUSE) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_GENIUS_MANTICORE) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_GENIUS_GX_IMPERATOR) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M406) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M506) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_I405X) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_MOUSEPEN_I608X) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M406W) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M610X) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_340) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_PENSKETCH_M912) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_MOUSEPEN_M508WX) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_MOUSEPEN_M508X) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M406XE) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, 
USB_DEVICE_ID_KYE_MOUSEPEN_I608X_V2) }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_PENSKETCH_T609A) }, { } }; MODULE_DEVICE_TABLE(hid, kye_devices); static struct hid_driver kye_driver = { .name = "kye", .id_table = kye_devices, .probe = kye_probe, .report_fixup = kye_report_fixup, }; module_hid_driver(kye_driver); MODULE_DESCRIPTION("HID driver for Kye/Genius devices not fully compliant with HID standard"); MODULE_LICENSE("GPL"); |
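/*
 * Illustrative sketch of the parameter-table approach described above,
 * extended to a hypothetical future model.  Everything here is invented
 * (the product id and all the maxima); a real entry would be appended to
 * kye_tablets_info[] before its terminating {} and would need a matching
 * USB_DEVICE_ID_* define plus hid_device_id and report_fixup/probe case
 * entries.  The fragment is kept under #if 0 because it is an example,
 * not a real device.
 */
#if 0   /* example only, not a real device */
        {USB_DEVICE_ID_KYE_HYPOTHETICAL,        /* invented product id */
         30720, 20480, 2047,                    /* X/Y/pressure logical maxima */
         10, 7, 0, 0x13, false},                /* physical maxima, unit exponent, unit, no mouse */
#endif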
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2017 Nicira, Inc.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/openvswitch.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <net/netlink.h> #include <net/genetlink.h> #include "datapath.h" #include "meter.h" static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, }; static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; static u32 meter_hash(struct dp_meter_instance *ti, u32 id) { return id % ti->n_meters; } static void ovs_meter_free(struct dp_meter *meter) { if (!meter) return; kfree_rcu(meter, rcu); } /* Call with ovs_mutex or RCU read lock. */ static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, u32 meter_id) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 hash = meter_hash(ti, meter_id); struct dp_meter *meter; meter = rcu_dereference_ovsl(ti->dp_meters[hash]); if (meter && likely(meter->id == meter_id)) return meter; return NULL; } static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) { struct dp_meter_instance *ti; ti = kvzalloc(struct_size(ti, dp_meters, size), GFP_KERNEL); if (!ti) return NULL; ti->n_meters = size; return ti; } static void dp_meter_instance_free(struct dp_meter_instance *ti) { kvfree(ti); } static void dp_meter_instance_free_rcu(struct rcu_head *rcu) { struct dp_meter_instance *ti; ti = container_of(rcu, struct dp_meter_instance, rcu); kvfree(ti); } static int dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); int n_meters = min(size, ti->n_meters); struct dp_meter_instance *new_ti; int i; new_ti = dp_meter_instance_alloc(size); if (!new_ti) return -ENOMEM; for (i = 0; i < n_meters; i++) if (rcu_dereference_ovsl(ti->dp_meters[i])) new_ti->dp_meters[i] = ti->dp_meters[i]; rcu_assign_pointer(tbl->ti, new_ti); call_rcu(&ti->rcu, dp_meter_instance_free_rcu); return 0; } static void dp_meter_instance_insert(struct dp_meter_instance *ti, struct dp_meter *meter) { u32 hash; hash = meter_hash(ti, meter->id); rcu_assign_pointer(ti->dp_meters[hash], meter); } static void dp_meter_instance_remove(struct dp_meter_instance *ti, struct dp_meter *meter) { u32 hash; hash = meter_hash(ti, meter->id); RCU_INIT_POINTER(ti->dp_meters[hash], NULL); } static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 hash = meter_hash(ti, meter->id); int err; /* In generally, slots selected should be empty, because * OvS uses id-pool to fetch a available id. */ if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) return -EBUSY; dp_meter_instance_insert(ti, meter); /* That function is thread-safe. 
*/ tbl->count++; if (tbl->count >= tbl->max_meters_allowed) { err = -EFBIG; goto attach_err; } if (tbl->count >= ti->n_meters && dp_meter_instance_realloc(tbl, ti->n_meters * 2)) { err = -ENOMEM; goto attach_err; } return 0; attach_err: dp_meter_instance_remove(ti, meter); tbl->count--; return err; } static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { struct dp_meter_instance *ti; ASSERT_OVSL(); if (!meter) return 0; ti = rcu_dereference_ovsl(tbl->ti); dp_meter_instance_remove(ti, meter); tbl->count--; /* Shrink the meter array if necessary. */ if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && tbl->count <= (ti->n_meters / 4)) { int half_size = ti->n_meters / 2; int i; /* Avoid hash collision, don't move slots to other place. * Make sure there are no references of meters in array * which will be released. */ for (i = half_size; i < ti->n_meters; i++) if (rcu_dereference_ovsl(ti->dp_meters[i])) goto out; if (dp_meter_instance_realloc(tbl, half_size)) goto shrink_err; } out: return 0; shrink_err: dp_meter_instance_insert(ti, meter); tbl->count++; return -ENOMEM; } static struct sk_buff * ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, struct ovs_header **ovs_reply_header) { struct sk_buff *skb; struct ovs_header *ovs_header = genl_info_userhdr(info); skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return ERR_PTR(-ENOMEM); *ovs_reply_header = genlmsg_put(skb, info->snd_portid, info->snd_seq, &dp_meter_genl_family, 0, cmd); if (!*ovs_reply_header) { nlmsg_free(skb); return ERR_PTR(-EMSGSIZE); } (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; return skb; } static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, struct dp_meter *meter) { struct nlattr *nla; struct dp_meter_band *band; u16 i; if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) goto error; if (nla_put(reply, OVS_METER_ATTR_STATS, sizeof(struct ovs_flow_stats), &meter->stats)) goto error; if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, OVS_METER_ATTR_PAD)) goto error; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); if (!nla) goto error; band = meter->bands; for (i = 0; i < meter->n_bands; ++i, ++band) { struct nlattr *band_nla; band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, sizeof(struct ovs_flow_stats), &band->stats)) goto error; nla_nest_end(reply, band_nla); } nla_nest_end(reply, nla); return 0; error: return -EMSGSIZE; } static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; struct sk_buff *reply; struct datapath *dp; int err = -EMSGSIZE; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, dp->meter_tbl.max_meters_allowed)) goto exit_unlock; ovs_unlock(); if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) goto nla_put_failure; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); if (!nla) goto nla_put_failure; band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); if (!band_nla) goto nla_put_failure; /* Currently only DROP band type is supported. 
*/ if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP)) goto nla_put_failure; nla_nest_end(reply, band_nla); nla_nest_end(reply, nla); genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_unlock: ovs_unlock(); nla_put_failure: nlmsg_free(reply); return err; } static struct dp_meter *dp_meter_create(struct nlattr **a) { struct nlattr *nla; int rem; u16 n_bands = 0; struct dp_meter *meter; struct dp_meter_band *band; int err; /* Validate attributes, count the bands. */ if (!a[OVS_METER_ATTR_BANDS]) return ERR_PTR(-EINVAL); nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) if (++n_bands > DP_MAX_BANDS) return ERR_PTR(-EINVAL); /* Allocate and set up the meter before locking anything. */ meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL_ACCOUNT); if (!meter) return ERR_PTR(-ENOMEM); meter->id = nla_get_u32(a[OVS_METER_ATTR_ID]); meter->used = div_u64(ktime_get_ns(), 1000 * 1000); meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0; meter->keep_stats = !a[OVS_METER_ATTR_CLEAR]; spin_lock_init(&meter->lock); if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) { meter->stats = *(struct ovs_flow_stats *) nla_data(a[OVS_METER_ATTR_STATS]); } meter->n_bands = n_bands; /* Set up meter bands. */ band = meter->bands; nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) { struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; u32 band_max_delta_t; err = nla_parse_deprecated((struct nlattr **)&attr, OVS_BAND_ATTR_MAX, nla_data(nla), nla_len(nla), band_policy, NULL); if (err) goto exit_free_meter; if (!attr[OVS_BAND_ATTR_TYPE] || !attr[OVS_BAND_ATTR_RATE] || !attr[OVS_BAND_ATTR_BURST]) { err = -EINVAL; goto exit_free_meter; } band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); if (band->rate == 0) { err = -EINVAL; goto exit_free_meter; } band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]); /* Figure out max delta_t that is enough to fill any bucket. * Keep max_delta_t size to the bucket units: * pkts => 1/1000 packets, kilobits => bits. * * Start with a full bucket. */ band->bucket = band->burst_size * 1000ULL; band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; band++; } return meter; exit_free_meter: kfree(meter); return ERR_PTR(err); } static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct dp_meter *meter, *old_meter; struct sk_buff *reply; struct ovs_header *ovs_reply_header; struct ovs_header *ovs_header = genl_info_userhdr(info); struct dp_meter_table *meter_tbl; struct datapath *dp; int err; u32 meter_id; bool failed; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; meter = dp_meter_create(a); if (IS_ERR(meter)) return PTR_ERR(meter); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, &ovs_reply_header); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto exit_free_meter; } ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } meter_tbl = &dp->meter_tbl; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); old_meter = lookup_meter(meter_tbl, meter_id); err = detach_meter(meter_tbl, old_meter); if (err) goto exit_unlock; err = attach_meter(meter_tbl, meter); if (err) goto exit_free_old_meter; ovs_unlock(); /* Build response with the meter_id and stats from * the old meter, if any. 
*/ failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id); WARN_ON(failed); if (old_meter) { spin_lock_bh(&old_meter->lock); if (old_meter->keep_stats) { err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); } spin_unlock_bh(&old_meter->lock); ovs_meter_free(old_meter); } genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_free_old_meter: ovs_meter_free(old_meter); exit_unlock: ovs_unlock(); nlmsg_free(reply); exit_free_meter: kfree(meter); return err; } static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *meter; struct sk_buff *reply; struct datapath *dp; u32 meter_id; int err; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } /* Locate meter, copy stats. */ meter = lookup_meter(&dp->meter_tbl, meter_id); if (!meter) { err = -ENOENT; goto exit_unlock; } spin_lock_bh(&meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); spin_unlock_bh(&meter->lock); if (err) goto exit_unlock; ovs_unlock(); genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_unlock: ovs_unlock(); nlmsg_free(reply); return err; } static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *old_meter; struct sk_buff *reply; struct datapath *dp; u32 meter_id; int err; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); old_meter = lookup_meter(&dp->meter_tbl, meter_id); if (old_meter) { spin_lock_bh(&old_meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); spin_unlock_bh(&old_meter->lock); err = detach_meter(&dp->meter_tbl, old_meter); if (err) goto exit_unlock; } ovs_unlock(); ovs_meter_free(old_meter); genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_unlock: ovs_unlock(); nlmsg_free(reply); return err; } /* Meter action execution. * * Return true 'meter_id' drop band is triggered. The 'skb' should be * dropped by the caller'. */ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long long int long_delta_ms; struct dp_meter_band *band; struct dp_meter *meter; int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; u32 delta_ms; u32 cost; meter = lookup_meter(&dp->meter_tbl, meter_id); /* Do not drop the packet when there is no meter. */ if (!meter) return false; /* Lock the meter while using it. */ spin_lock(&meter->lock); long_delta_ms = (now_ms - meter->used); /* ms */ if (long_delta_ms < 0) { /* This condition means that we have several threads fighting * for a meter lock, and the one who received the packets a * bit later wins. 
Assuming that all racing threads received * packets at the same time to avoid overflow. */ long_delta_ms = 0; } /* Make sure delta_ms will not be too large, so that bucket will not * wrap around below. */ delta_ms = (long_delta_ms > (long long int)meter->max_delta_t) ? meter->max_delta_t : (u32)long_delta_ms; /* Update meter statistics. */ meter->used = now_ms; meter->stats.n_packets += 1; meter->stats.n_bytes += skb->len; /* Bucket rate is either in kilobits per second, or in packets per * second. We maintain the bucket in the units of either bits or * 1/1000th of a packet, correspondingly. * Then, when rate is multiplied with milliseconds, we get the * bucket units: * msec * kbps = bits, and * msec * packets/sec = 1/1000 packets. * * 'cost' is the number of bucket units in this packet. */ cost = (meter->kbps) ? skb->len * 8 : 1000; /* Update all bands and find the one hit with the highest rate. */ for (i = 0; i < meter->n_bands; ++i) { long long int max_bucket_size; band = &meter->bands[i]; max_bucket_size = band->burst_size * 1000LL; band->bucket += delta_ms * band->rate; if (band->bucket > max_bucket_size) band->bucket = max_bucket_size; if (band->bucket >= cost) { band->bucket -= cost; } else if (band->rate > band_exceeded_rate) { band_exceeded_rate = band->rate; band_exceeded_max = i; } } if (band_exceeded_max >= 0) { /* Update band statistics. */ band = &meter->bands[band_exceeded_max]; band->stats.n_packets += 1; band->stats.n_bytes += skb->len; /* Drop band triggered, let the caller drop the 'skb'. */ if (band->type == OVS_METER_BAND_TYPE_DROP) { spin_unlock(&meter->lock); return true; } } spin_unlock(&meter->lock); return false; } static const struct genl_small_ops dp_meter_genl_ops[] = { { .cmd = OVS_METER_CMD_FEATURES, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_meter_cmd_features }, { .cmd = OVS_METER_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ .doit = ovs_meter_cmd_set, }, { .cmd = OVS_METER_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_meter_cmd_get, }, { .cmd = OVS_METER_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ .doit = ovs_meter_cmd_del }, }; static const struct genl_multicast_group ovs_meter_multicast_group = { .name = OVS_METER_MCGROUP, }; struct genl_family dp_meter_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_METER_FAMILY, .version = OVS_METER_VERSION, .maxattr = OVS_METER_ATTR_MAX, .policy = meter_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_meter_genl_ops, .n_small_ops = ARRAY_SIZE(dp_meter_genl_ops), .resv_start_op = OVS_METER_CMD_GET + 1, .mcgrps = &ovs_meter_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; int ovs_meters_init(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti; unsigned long free_mem_bytes; ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); if (!ti) return -ENOMEM; /* Allow meters in a datapath to use ~3.12% of physical memory. 
*/ free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), DP_METER_NUM_MAX); if (!tbl->max_meters_allowed) goto out_err; rcu_assign_pointer(tbl->ti, ti); tbl->count = 0; return 0; out_err: dp_meter_instance_free(ti); return -ENOMEM; } void ovs_meters_exit(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); int i; for (i = 0; i < ti->n_meters; i++) ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); dp_meter_instance_free(ti); } |
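/*
 * Stand-alone sketch of the token-bucket arithmetic used by
 * ovs_meter_execute() above, in the same units: a kbps band keeps its
 * bucket in bits and a packet costs skb->len * 8, while a
 * packet-per-second band keeps the bucket in 1/1000 packets and every
 * packet costs 1000.  The struct and function names are illustrative,
 * not kernel API.
 */
#include <linux/types.h>

struct example_band {
        u32 rate;               /* kbps, or packets per second */
        u32 burst_size;         /* kbits, or packets */
        long long bucket;       /* bits, or 1/1000 packets */
};

/* Returns true when the band is exceeded, i.e. the packet should be
 * counted against this band (and dropped for a DROP band). */
static bool example_band_exceeded(struct example_band *b, u32 delta_ms,
                                  u32 pkt_len, bool kbps)
{
        long long max_bucket = b->burst_size * 1000LL;
        u32 cost = kbps ? pkt_len * 8 : 1000;

        /* Refill for the elapsed time, capped at a full bucket. */
        b->bucket += (long long)delta_ms * b->rate;
        if (b->bucket > max_bucket)
                b->bucket = max_bucket;

        if (b->bucket >= cost) {
                b->bucket -= cost;
                return false;
        }
        return true;
}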
267 32 257 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_TIMEOUT_H #define _NF_CONNTRACK_TIMEOUT_H #include <net/net_namespace.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/refcount.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #define CTNL_TIMEOUT_NAME_MAX 32 struct nf_ct_timeout { __u16 l3num; const struct nf_conntrack_l4proto *l4proto; char data[]; }; struct nf_conn_timeout { struct nf_ct_timeout __rcu *timeout; }; static inline unsigned int * nf_ct_timeout_data(const struct nf_conn_timeout *t) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_ct_timeout *timeout; timeout = rcu_dereference(t->timeout); if (timeout == NULL) return NULL; return (unsigned int *)timeout->data; #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_find(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT return nf_ct_ext_find(ct, NF_CT_EXT_TIMEOUT); #else return NULL; #endif } static inline struct nf_conn_timeout *nf_ct_timeout_ext_add(struct nf_conn *ct, struct nf_ct_timeout *timeout, gfp_t gfp) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_ext_add(ct, NF_CT_EXT_TIMEOUT, gfp); if (timeout_ext == NULL) return NULL; rcu_assign_pointer(timeout_ext->timeout, timeout); return timeout_ext; #else return NULL; #endif }; static inline unsigned int *nf_ct_timeout_lookup(const struct nf_conn *ct) { unsigned int *timeouts = NULL; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_conn_timeout *timeout_ext; timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) timeouts = nf_ct_timeout_data(timeout_ext); #endif return timeouts; } #ifdef CONFIG_NF_CONNTRACK_TIMEOUT void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout); int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name); void nf_ct_destroy_timeout(struct nf_conn *ct); #else static inline int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) { return -EOPNOTSUPP; } static inline void nf_ct_destroy_timeout(struct nf_conn *ct) { return; } #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ #ifdef CONFIG_NF_CONNTRACK_TIMEOUT struct nf_ct_timeout_hooks { struct nf_ct_timeout *(*timeout_find_get)(struct net *net, const char *name); void (*timeout_put)(struct nf_ct_timeout *timeout); }; extern const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook; #endif #endif /* _NF_CONNTRACK_TIMEOUT_H */ |
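/*
 * Usage sketch (not part of the header above): how a connection tracker
 * typically consumes the timeout extension.  nf_ct_timeout_lookup()
 * returns the per-connection policy attached via cttimeout, or NULL, in
 * which case the caller falls back to its own defaults.
 * example_timeout_for_state() and example_default_timeouts are
 * placeholder names for illustration.
 */
#include <net/netfilter/nf_conntrack_timeout.h>

static unsigned int example_timeout_for_state(const struct nf_conn *ct,
                                              unsigned int state,
                                              const unsigned int *example_default_timeouts)
{
        const unsigned int *timeouts = nf_ct_timeout_lookup(ct);

        if (!timeouts)
                timeouts = example_default_timeouts;

        return timeouts[state]; /* timeout in jiffies for this protocol state */
}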
11 11 6 6 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat common functions for device specific drivers * * Copyright (c) 2011 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ #include <linux/hid.h> #include <linux/slab.h> #include <linux/module.h> #include "hid-roccat-common.h" static inline uint16_t roccat_common2_feature_report(uint8_t report_id) { return 0x300 | report_id; } int roccat_common2_receive(struct usb_device *usb_dev, uint report_id, void *data, uint size) { char *buf; int len; buf = kmalloc(size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; len = usb_control_msg(usb_dev, usb_rcvctrlpipe(usb_dev, 0), HID_REQ_GET_REPORT, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_IN, roccat_common2_feature_report(report_id), 0, buf, size, USB_CTRL_SET_TIMEOUT); memcpy(data, buf, size); kfree(buf); return ((len < 0) ? len : ((len != size) ? -EIO : 0)); } EXPORT_SYMBOL_GPL(roccat_common2_receive); int roccat_common2_send(struct usb_device *usb_dev, uint report_id, void const *data, uint size) { char *buf; int len; buf = kmemdup(data, size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; len = usb_control_msg(usb_dev, usb_sndctrlpipe(usb_dev, 0), HID_REQ_SET_REPORT, USB_TYPE_CLASS | USB_RECIP_INTERFACE | USB_DIR_OUT, roccat_common2_feature_report(report_id), 0, buf, size, USB_CTRL_SET_TIMEOUT); kfree(buf); return ((len < 0) ? len : ((len != size) ? 
-EIO : 0)); } EXPORT_SYMBOL_GPL(roccat_common2_send); enum roccat_common2_control_states { ROCCAT_COMMON_CONTROL_STATUS_CRITICAL = 0, ROCCAT_COMMON_CONTROL_STATUS_OK = 1, ROCCAT_COMMON_CONTROL_STATUS_INVALID = 2, ROCCAT_COMMON_CONTROL_STATUS_BUSY = 3, ROCCAT_COMMON_CONTROL_STATUS_CRITICAL_NEW = 4, }; static int roccat_common2_receive_control_status(struct usb_device *usb_dev) { int retval; struct roccat_common2_control control; do { msleep(50); retval = roccat_common2_receive(usb_dev, ROCCAT_COMMON_COMMAND_CONTROL, &control, sizeof(struct roccat_common2_control)); if (retval) return retval; switch (control.value) { case ROCCAT_COMMON_CONTROL_STATUS_OK: return 0; case ROCCAT_COMMON_CONTROL_STATUS_BUSY: msleep(500); continue; case ROCCAT_COMMON_CONTROL_STATUS_INVALID: case ROCCAT_COMMON_CONTROL_STATUS_CRITICAL: case ROCCAT_COMMON_CONTROL_STATUS_CRITICAL_NEW: return -EINVAL; default: dev_err(&usb_dev->dev, "roccat_common2_receive_control_status: " "unknown response value 0x%x\n", control.value); return -EINVAL; } } while (1); } int roccat_common2_send_with_status(struct usb_device *usb_dev, uint command, void const *buf, uint size) { int retval; retval = roccat_common2_send(usb_dev, command, buf, size); if (retval) return retval; msleep(100); return roccat_common2_receive_control_status(usb_dev); } EXPORT_SYMBOL_GPL(roccat_common2_send_with_status); int roccat_common2_device_init_struct(struct usb_device *usb_dev, struct roccat_common2_device *dev) { mutex_init(&dev->lock); return 0; } EXPORT_SYMBOL_GPL(roccat_common2_device_init_struct); ssize_t roccat_common2_sysfs_read(struct file *fp, struct kobject *kobj, char *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct roccat_common2_device *roccat_dev = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off >= real_size) return 0; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&roccat_dev->lock); retval = roccat_common2_receive(usb_dev, command, buf, real_size); mutex_unlock(&roccat_dev->lock); return retval ? retval : real_size; } EXPORT_SYMBOL_GPL(roccat_common2_sysfs_read); ssize_t roccat_common2_sysfs_write(struct file *fp, struct kobject *kobj, void const *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct roccat_common2_device *roccat_dev = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&roccat_dev->lock); retval = roccat_common2_send_with_status(usb_dev, command, buf, real_size); mutex_unlock(&roccat_dev->lock); return retval ? retval : real_size; } EXPORT_SYMBOL_GPL(roccat_common2_sysfs_write); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat common driver"); MODULE_LICENSE("GPL v2"); |
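/*
 * Usage sketch: how a device-specific Roccat driver would push a
 * settings buffer with the helpers exported above.  The command id and
 * the function name are made up; real report ids are defined per device
 * and the prototypes come from hid-roccat-common.h.
 */
#include <linux/usb.h>
#include "hid-roccat-common.h"

#define EXAMPLE_ROCCAT_COMMAND_SETTINGS 0x06    /* hypothetical report id */

static int example_write_settings(struct usb_device *usb_dev,
                                  void const *settings, uint size)
{
        /*
         * Sends the SET_REPORT with the payload, then polls the control
         * report until the device answers OK, retrying while it is BUSY.
         */
        return roccat_common2_send_with_status(usb_dev,
                                               EXAMPLE_ROCCAT_COMMAND_SETTINGS,
                                               settings, size);
}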
1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | // SPDX-License-Identifier: GPL-2.0-only /* * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS * * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 * Wensong Zhang <wensong@linuxvirtualserver.org> */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/in.h> #include <linux/ip.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/ip_vs.h> /* TODO: struct isakmp_hdr { __u8 icookie[8]; __u8 rcookie[8]; __u8 np; __u8 version; __u8 xchgtype; __u8 flags; __u32 msgid; __u32 length; }; */ #define PORT_ISAKMP 500 static void ah_esp_conn_fill_param_proto(struct netns_ipvs *ipvs, int af, const struct ip_vs_iphdr *iph, struct ip_vs_conn_param *p) { if (likely(!ip_vs_iph_inverse(iph))) ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->saddr, htons(PORT_ISAKMP), &iph->daddr, htons(PORT_ISAKMP), p); else ip_vs_conn_fill_param(ipvs, af, IPPROTO_UDP, &iph->daddr, htons(PORT_ISAKMP), &iph->saddr, htons(PORT_ISAKMP), p); } static struct ip_vs_conn * ah_esp_conn_in_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_in_get(&p); if (!cp) { /* * We are not sure if the packet is from our * service, so our conn_schedule hook should return NF_ACCEPT */ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " "%s%s %s->%s\n", ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; } static struct ip_vs_conn * ah_esp_conn_out_get(struct netns_ipvs *ipvs, int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; ah_esp_conn_fill_param_proto(ipvs, af, iph, &p); cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", ip_vs_iph_icmp(iph) ? "ICMP+" : "", ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } return cp; } static int ah_esp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { /* * AH/ESP is only related traffic. Pass the packet to IP stack. 
*/ *verdict = NF_ACCEPT; return 0; } #ifdef CONFIG_IP_VS_PROTO_AH struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, .num_states = 1, .dont_defrag = 1, .init = NULL, .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, .state_transition = NULL, .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif #ifdef CONFIG_IP_VS_PROTO_ESP struct ip_vs_protocol ip_vs_protocol_esp = { .name = "ESP", .protocol = IPPROTO_ESP, .num_states = 1, .dont_defrag = 1, .init = NULL, .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, .snat_handler = NULL, .dnat_handler = NULL, .state_transition = NULL, .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif |
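/*
 * Illustrative sketch only: because AH/ESP carries no ports, the handlers
 * above key every connection on the fixed ISAKMP tuple (UDP, port 500 on
 * both sides), so the IPsec data stream is scheduled to the same real
 * server as the IKE negotiation. The helper below is hypothetical and not
 * used by IPVS; it merely spells out the invariant that two parameter
 * blocks built from the same client/VIP pair compare equal.
 */
static bool __maybe_unused
example_isakmp_params_match(const struct ip_vs_conn_param *ike,
			    const struct ip_vs_conn_param *esp)
{
	/* Same family, same (UDP) protocol, same addresses, both on port 500. */
	return ike->af == esp->af &&
	       ike->protocol == esp->protocol &&
	       ike->cport == esp->cport && ike->cport == htons(PORT_ISAKMP) &&
	       ike->vport == esp->vport && ike->vport == htons(PORT_ISAKMP) &&
	       ip_vs_addr_equal(ike->af, ike->caddr, esp->caddr) &&
	       ip_vs_addr_equal(ike->af, ike->vaddr, esp->vaddr);
}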
// SPDX-License-Identifier: GPL-2.0-or-later /* * NET3 IP device support routines. * * Derived from the IP parts of dev.c 1.0.19 * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * * Additional Authors: * Alan Cox, <gw4pts@gw4pts.ampr.org> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Changes: * Alexey Kuznetsov: pa_* fields are replaced with ifaddr * lists. * Cyrus Durgin: updated for kmod * Matthias Andree: in devinet_ioctl, compare label and * address (4.4BSD alias style support), * fall back to comparing just the label * if no match found. */ #include <linux/uaccess.h> #include <linux/bitops.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/interrupt.h> #include <linux/if_addr.h> #include <linux/if_ether.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/kmod.h> #include <linux/netconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/addrconf.h> #define IPV6ONLY_FLAGS \ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY) static struct ipv4_devconf ipv4_devconf = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; static struct ipv4_devconf ipv4_devconf_dflt = { .data = { [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; #define IPV4_DEVCONF_DFLT(net, attr) \ IPV4_DEVCONF((*net->ipv4.devconf_dflt), attr) static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, [IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) }, [IFA_FLAGS] = { .type = NLA_U32 }, [IFA_RT_PRIORITY] = { .type = NLA_U32 }, [IFA_TARGET_NETNSID] = { .type = NLA_S32 }, [IFA_PROTO] = { .type = NLA_U8 }, }; struct inet_fill_args { u32 portid; u32 seq; int event; unsigned int flags; int netnsid; int ifindex; }; #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) static u32 inet_addr_hash(const struct net *net, __be32 addr) { u32 val = __ipv4_addr_hash(addr, net_hash_mix(net)); return hash_32(val,
IN4_ADDR_HSIZE_SHIFT); } static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa) { u32 hash = inet_addr_hash(net, ifa->ifa_local); ASSERT_RTNL(); hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); } static void inet_hash_remove(struct in_ifaddr *ifa) { ASSERT_RTNL(); hlist_del_init_rcu(&ifa->addr_lst); } /** * __ip_dev_find - find the first device with a given source address. * @net: the net namespace * @addr: the source address * @devref: if true, take a reference on the found device * * If a caller uses devref=false, it should be protected by RCU, or RTNL */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { struct net_device *result = NULL; struct in_ifaddr *ifa; rcu_read_lock(); ifa = inet_lookup_ifaddr_rcu(net, addr); if (!ifa) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res = { 0 }; struct fib_table *local; /* Fallback to FIB local table so that communication * over loopback subnets work. */ local = fib_get_table(net, RT_TABLE_LOCAL); if (local && !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) && res.type == RTN_LOCAL) result = FIB_RES_DEV(res); } else { result = ifa->ifa_dev->dev; } if (result && devref) dev_hold(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL(__ip_dev_find); /* called under RCU lock */ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr) { u32 hash = inet_addr_hash(net, addr); struct in_ifaddr *ifa; hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) if (ifa->ifa_local == addr) return ifa; return NULL; } static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32); static BLOCKING_NOTIFIER_HEAD(inetaddr_chain); static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain); static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy); #ifdef CONFIG_SYSCTL static int devinet_sysctl_register(struct in_device *idev); static void devinet_sysctl_unregister(struct in_device *idev); #else static int devinet_sysctl_register(struct in_device *idev) { return 0; } static void devinet_sysctl_unregister(struct in_device *idev) { } #endif /* Locks all the inet devices. */ static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev) { struct in_ifaddr *ifa; ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT); if (!ifa) return NULL; in_dev_hold(in_dev); ifa->ifa_dev = in_dev; INIT_HLIST_NODE(&ifa->addr_lst); return ifa; } static void inet_rcu_free_ifa(struct rcu_head *head) { struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); in_dev_put(ifa->ifa_dev); kfree(ifa); } static void inet_free_ifa(struct in_ifaddr *ifa) { /* Our reference to ifa->ifa_dev must be freed ASAP * to release the reference to the netdev the same way. * in_dev_put() -> in_dev_finish_destroy() -> netdev_put() */ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa); } static void in_dev_free_rcu(struct rcu_head *head) { struct in_device *idev = container_of(head, struct in_device, rcu_head); kfree(rcu_dereference_protected(idev->mc_hash, 1)); kfree(idev); } void in_dev_finish_destroy(struct in_device *idev) { struct net_device *dev = idev->dev; WARN_ON(idev->ifa_list); WARN_ON(idev->mc_list); #ifdef NET_REFCNT_DEBUG pr_debug("%s: %p=%s\n", __func__, idev, dev ? 
dev->name : "NIL"); #endif netdev_put(dev, &idev->dev_tracker); if (!idev->dead) pr_err("Freeing alive in_device %p\n", idev); else call_rcu(&idev->rcu_head, in_dev_free_rcu); } EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { struct in_device *in_dev; int err = -ENOMEM; ASSERT_RTNL(); in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL); if (!in_dev) goto out; memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt, sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) dev_disable_lro(dev); /* Reference in_dev->dev */ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ refcount_set(&in_dev->refcnt, 1); if (dev != blackhole_netdev) { err = devinet_sysctl_register(in_dev); if (err) { in_dev->dead = 1; neigh_parms_release(&arp_tbl, in_dev->arp_parms); in_dev_put(in_dev); in_dev = NULL; goto out; } ip_mc_init_dev(in_dev); if (dev->flags & IFF_UP) ip_mc_up(in_dev); } /* we can receive as soon as ip_ptr is set -- do this last */ rcu_assign_pointer(dev->ip_ptr, in_dev); out: return in_dev ?: ERR_PTR(err); out_kfree: kfree(in_dev); in_dev = NULL; goto out; } static void inetdev_destroy(struct in_device *in_dev) { struct net_device *dev; struct in_ifaddr *ifa; ASSERT_RTNL(); dev = in_dev->dev; in_dev->dead = 1; ip_mc_destroy_dev(in_dev); while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) { inet_del_ifa(in_dev, &in_dev->ifa_list, 0); inet_free_ifa(ifa); } RCU_INIT_POINTER(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); arp_ifdown(dev); in_dev_put(in_dev); } static int __init inet_blackhole_dev_init(void) { int err = 0; rtnl_lock(); if (!inetdev_init(blackhole_netdev)) err = -ENOMEM; rtnl_unlock(); return err; } late_initcall(inet_blackhole_dev_init); int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b) { const struct in_ifaddr *ifa; rcu_read_lock(); in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(a, ifa)) { if (!b || inet_ifa_match(b, ifa)) { rcu_read_unlock(); return 1; } } } rcu_read_unlock(); return 0; } static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy, struct nlmsghdr *nlh, u32 portid) { struct in_ifaddr *promote = NULL; struct in_ifaddr *ifa, *ifa1; struct in_ifaddr __rcu **last_prim; struct in_ifaddr *prev_prom = NULL; int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev); ASSERT_RTNL(); ifa1 = rtnl_dereference(*ifap); last_prim = ifap; if (in_dev->dead) goto no_promotions; /* 1. 
Deleting primary ifaddr forces deletion all secondaries * unless alias promotion is set **/ if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next; while ((ifa = rtnl_dereference(*ifap1)) != NULL) { if (!(ifa->ifa_flags & IFA_F_SECONDARY) && ifa1->ifa_scope <= ifa->ifa_scope) last_prim = &ifa->ifa_next; if (!(ifa->ifa_flags & IFA_F_SECONDARY) || ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) { ifap1 = &ifa->ifa_next; prev_prom = ifa; continue; } if (!do_promote) { inet_hash_remove(ifa); *ifap1 = ifa->ifa_next; rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); inet_free_ifa(ifa); } else { promote = ifa; break; } } } /* On promotion all secondaries from subnet are changing * the primary IP, we must remove all their routes silently * and later to add them back with new prefsrc. Do this * while all addresses are on the device list. */ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) fib_del_ifaddr(ifa, ifa1); } no_promotions: /* 2. Unlink it */ *ifap = ifa1->ifa_next; inet_hash_remove(ifa1); /* 3. Announce address deletion */ /* Send message first, then call notifier. At first sight, FIB update triggered by notifier will refer to already deleted ifaddr, that could confuse netlink listeners. It is not true: look, gated sees that route deleted and if it still thinks that ifaddr is valid, it will try to restore deleted routes... Grr. So that, this order is correct. */ rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); if (promote) { struct in_ifaddr *next_sec; next_sec = rtnl_dereference(promote->ifa_next); if (prev_prom) { struct in_ifaddr *last_sec; rcu_assign_pointer(prev_prom->ifa_next, next_sec); last_sec = rtnl_dereference(*last_prim); rcu_assign_pointer(promote->ifa_next, last_sec); rcu_assign_pointer(*last_prim, promote); } promote->ifa_flags &= ~IFA_F_SECONDARY; rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, promote); for (ifa = next_sec; ifa; ifa = rtnl_dereference(ifa->ifa_next)) { if (ifa1->ifa_mask != ifa->ifa_mask || !inet_ifa_match(ifa1->ifa_address, ifa)) continue; fib_add_ifaddr(ifa); } } if (destroy) inet_free_ifa(ifa1); } static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr __rcu **ifap, int destroy) { __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); } static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid, struct netlink_ext_ack *extack) { struct in_ifaddr __rcu **last_primary, **ifap; struct in_device *in_dev = ifa->ifa_dev; struct net *net = dev_net(in_dev->dev); struct in_validator_info ivi; struct in_ifaddr *ifa1; int ret; ASSERT_RTNL(); ifa->ifa_flags &= ~IFA_F_SECONDARY; last_primary = &in_dev->ifa_list; /* Don't set IPv6 only flags to IPv4 addresses */ ifa->ifa_flags &= ~IPV6ONLY_FLAGS; ifap = &in_dev->ifa_list; ifa1 = rtnl_dereference(*ifap); while (ifa1) { if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && ifa->ifa_scope <= ifa1->ifa_scope) last_primary = &ifa1->ifa_next; if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa)) { if (ifa1->ifa_local == ifa->ifa_local) { inet_free_ifa(ifa); return -EEXIST; } if (ifa1->ifa_scope != ifa->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value"); inet_free_ifa(ifa); return -EINVAL; } ifa->ifa_flags |= IFA_F_SECONDARY; } ifap = 
&ifa1->ifa_next; ifa1 = rtnl_dereference(*ifap); } /* Allow any devices that wish to register ifaddr validators to weigh * in now, before changes are committed. The rtnl lock is serializing * access here, so the state should not change between a validator call * and a final notify on commit. This isn't invoked on promotion under * the assumption that validators are checking the address itself, and * not the flags. */ ivi.ivi_addr = ifa->ifa_address; ivi.ivi_dev = ifa->ifa_dev; ivi.extack = extack; ret = blocking_notifier_call_chain(&inetaddr_validator_chain, NETDEV_UP, &ivi); ret = notifier_to_errno(ret); if (ret) { inet_free_ifa(ifa); return ret; } if (!(ifa->ifa_flags & IFA_F_SECONDARY)) ifap = last_primary; rcu_assign_pointer(ifa->ifa_next, *ifap); rcu_assign_pointer(*ifap, ifa); inet_hash_insert(dev_net(in_dev->dev), ifa); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); /* Send message first, then call notifier. Notifier will trigger FIB update, so that listeners of netlink will know about new ifaddr */ rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid); blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); return 0; } static int inet_insert_ifa(struct in_ifaddr *ifa) { if (!ifa->ifa_local) { inet_free_ifa(ifa); return 0; } return __inet_insert_ifa(ifa, NULL, 0, NULL); } static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (ipv4_is_loopback(ifa->ifa_local)) ifa->ifa_scope = RT_SCOPE_HOST; return inet_insert_ifa(ifa); } /* Caller must hold RCU or RTNL : * We don't take a reference on found in_device */ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) in_dev = rcu_dereference_rtnl(dev->ip_ptr); rcu_read_unlock(); return in_dev; } EXPORT_SYMBOL(inetdev_by_index); /* Called only from RTNL semaphored context. No locks.
*/ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, __be32 mask) { struct in_ifaddr *ifa; ASSERT_RTNL(); in_dev_for_each_ifa_rtnl(ifa, in_dev) { if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) return ifa; } return NULL; } static int ip_mc_autojoin_config(struct net *net, bool join, const struct in_ifaddr *ifa) { #if defined(CONFIG_IP_MULTICAST) struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ifa->ifa_address, .imr_ifindex = ifa->ifa_dev->dev->ifindex, }; struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; ASSERT_RTNL_NET(net); lock_sock(sk); if (join) ret = ip_mc_join_group(sk, &mreq); else ret = ip_mc_leave_group(sk, &mreq); release_sock(sk); return ret; #else return -EOPNOTSUPP; #endif } static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct in_ifaddr __rcu **ifap; struct nlattr *tb[IFA_MAX+1]; struct in_device *in_dev; struct ifaddrmsg *ifm; struct in_ifaddr *ifa; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) goto out; ifm = nlmsg_data(nlh); rtnl_net_lock(net); in_dev = inetdev_by_index(net, ifm->ifa_index); if (!in_dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); err = -ENODEV; goto unlock; } for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL])) continue; if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label)) continue; if (tb[IFA_ADDRESS] && (ifm->ifa_prefixlen != ifa->ifa_prefixlen || !inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]), ifa))) continue; if (ipv4_is_multicast(ifa->ifa_address)) ip_mc_autojoin_config(net, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); goto unlock; } NL_SET_ERR_MSG(extack, "ipv4: Address not found"); err = -EADDRNOTAVAIL; unlock: rtnl_net_unlock(net); out: return err; } static void check_lifetime(struct work_struct *work) { unsigned long now, next, next_sec, next_sched; struct in_ifaddr *ifa; struct hlist_node *n; struct net *net; int i; net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); for (i = 0; i < IN4_ADDR_HSIZE; i++) { struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; bool change_needed = false; rcu_read_lock(); hlist_for_each_entry_rcu(ifa, head, addr_lst) { unsigned long age, tstamp; u32 preferred_lft; u32 valid_lft; u32 flags; flags = READ_ONCE(ifa->ifa_flags); if (flags & IFA_F_PERMANENT) continue; preferred_lft = READ_ONCE(ifa->ifa_preferred_lft); valid_lft = READ_ONCE(ifa->ifa_valid_lft); tstamp = READ_ONCE(ifa->ifa_tstamp); /* We try to batch several events at once. */ age = (now - tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (valid_lft != INFINITY_LIFE_TIME && age >= valid_lft) { change_needed = true; } else if (preferred_lft == INFINITY_LIFE_TIME) { continue; } else if (age >= preferred_lft) { if (time_before(tstamp + valid_lft * HZ, next)) next = tstamp + valid_lft * HZ; if (!(flags & IFA_F_DEPRECATED)) change_needed = true; } else if (time_before(tstamp + preferred_lft * HZ, next)) { next = tstamp + preferred_lft * HZ; } } rcu_read_unlock(); if (!change_needed) continue; rtnl_net_lock(net); hlist_for_each_entry_safe(ifa, n, head, addr_lst) { unsigned long age; if (ifa->ifa_flags & IFA_F_PERMANENT) continue; /* We try to batch several events at once. 
*/ age = (now - ifa->ifa_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_valid_lft) { struct in_ifaddr __rcu **ifap; struct in_ifaddr *tmp; ifap = &ifa->ifa_dev->ifa_list; tmp = rtnl_net_dereference(net, *ifap); while (tmp) { if (tmp == ifa) { inet_del_ifa(ifa->ifa_dev, ifap, 1); break; } ifap = &tmp->ifa_next; tmp = rtnl_net_dereference(net, *ifap); } } else if (ifa->ifa_preferred_lft != INFINITY_LIFE_TIME && age >= ifa->ifa_preferred_lft && !(ifa->ifa_flags & IFA_F_DEPRECATED)) { ifa->ifa_flags |= IFA_F_DEPRECATED; rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } rtnl_net_unlock(net); } next_sec = round_jiffies_up(next); next_sched = next; /* If rounded timeout is accurate enough, accept it. */ if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ)) next_sched = next_sec; now = jiffies; /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */ if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, next_sched - now); } static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, __u32 prefered_lft) { unsigned long timeout; u32 flags; flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED); timeout = addrconf_timeout_fixup(valid_lft, HZ); if (addrconf_finite_timeout(timeout)) WRITE_ONCE(ifa->ifa_valid_lft, timeout); else flags |= IFA_F_PERMANENT; timeout = addrconf_timeout_fixup(prefered_lft, HZ); if (addrconf_finite_timeout(timeout)) { if (timeout == 0) flags |= IFA_F_DEPRECATED; WRITE_ONCE(ifa->ifa_preferred_lft, timeout); } WRITE_ONCE(ifa->ifa_flags, flags); WRITE_ONCE(ifa->ifa_tstamp, jiffies); if (!ifa->ifa_cstamp) WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp); } static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack, __u32 *valid_lft, __u32 *prefered_lft) { struct ifaddrmsg *ifm = nlmsg_data(nlh); int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; if (ifm->ifa_prefixlen > 32) { NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length"); return -EINVAL; } if (!tb[IFA_LOCAL]) { NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied"); return -EINVAL; } if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid"); return -EINVAL; } *valid_lft = ci->ifa_valid; *prefered_lft = ci->ifa_prefered; } return 0; } static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifaddrmsg *ifm = nlmsg_data(nlh); struct in_device *in_dev; struct net_device *dev; struct in_ifaddr *ifa; int err; dev = __dev_get_by_index(net, ifm->ifa_index); err = -ENODEV; if (!dev) { NL_SET_ERR_MSG(extack, "ipv4: Device not found"); goto errout; } in_dev = __in_dev_get_rtnl_net(dev); err = -ENOBUFS; if (!in_dev) goto errout; ifa = inet_alloc_ifa(in_dev); if (!ifa) /* * A potential indev allocation can be left alive, it stays * assigned to its device and is destroyed with it.
*/ goto errout; ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); if (!tb[IFA_ADDRESS]) tb[IFA_ADDRESS] = tb[IFA_LOCAL]; ifa->ifa_prefixlen = ifm->ifa_prefixlen; ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); ifa->ifa_scope = ifm->ifa_scope; ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]); ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]); if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (tb[IFA_RT_PRIORITY]) ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]); if (tb[IFA_PROTO]) ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]); return ifa; errout: return ERR_PTR(err); } static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa) { struct in_device *in_dev = ifa->ifa_dev; struct in_ifaddr *ifa1; in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) { if (ifa1->ifa_mask == ifa->ifa_mask && inet_ifa_match(ifa1->ifa_address, ifa) && ifa1->ifa_local == ifa->ifa_local) return ifa1; } return NULL; } static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { __u32 prefered_lft = INFINITY_LIFE_TIME; __u32 valid_lft = INFINITY_LIFE_TIME; struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa_existing; struct nlattr *tb[IFA_MAX + 1]; struct in_ifaddr *ifa; int ret; ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft); if (ret < 0) return ret; if (!nla_get_in_addr(tb[IFA_LOCAL])) return 0; rtnl_net_lock(net); ifa = inet_rtm_to_ifa(net, nlh, tb, extack); if (IS_ERR(ifa)) { ret = PTR_ERR(ifa); goto unlock; } ifa_existing = find_matching_ifa(net, ifa); if (!ifa_existing) { /* It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed"); inet_free_ifa(ifa); goto unlock; } } ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack); } else { u32 new_metric = ifa->ifa_rt_priority; u8 new_proto = ifa->ifa_proto; inet_free_ifa(ifa); if (nlh->nlmsg_flags & NLM_F_EXCL || !(nlh->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "ipv4: Address already assigned"); ret = -EEXIST; goto unlock; } ifa = ifa_existing; if (ifa->ifa_rt_priority != new_metric) { fib_modify_prefix_metric(ifa, new_metric); ifa->ifa_rt_priority = new_metric; } ifa->ifa_proto = new_proto; set_ifa_lifetime(ifa, valid_lft, prefered_lft); cancel_delayed_work(&net->ipv4.addr_chk_work); queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); } unlock: rtnl_net_unlock(net); return ret; } /* * Determine a default network mask, based on the IP address. */ static int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. 
*/ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) rc = 0; else { __u32 haddr = ntohl(addr); if (IN_CLASSA(haddr)) rc = 8; else if (IN_CLASSB(haddr)) rc = 16; else if (IN_CLASSC(haddr)) rc = 24; else if (IN_CLASSE(haddr)) rc = 32; } return rc; } int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr) { struct sockaddr_in sin_orig; struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr; struct in_ifaddr __rcu **ifap = NULL; struct in_device *in_dev; struct in_ifaddr *ifa = NULL; struct net_device *dev; char *colon; int ret = -EFAULT; int tryaddrmatch = 0; ifr->ifr_name[IFNAMSIZ - 1] = 0; /* save original address for comparison */ memcpy(&sin_orig, sin, sizeof(*sin)); colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; dev_load(net, ifr->ifr_name); switch (cmd) { case SIOCGIFADDR: /* Get interface address */ case SIOCGIFBRDADDR: /* Get the broadcast address */ case SIOCGIFDSTADDR: /* Get the destination address */ case SIOCGIFNETMASK: /* Get the netmask for the interface */ /* Note that these ioctls will not sleep, so that we do not impose a lock. One day we will be forced to put shlock here (I mean SMP) */ tryaddrmatch = (sin_orig.sin_family == AF_INET); memset(sin, 0, sizeof(*sin)); sin->sin_family = AF_INET; break; case SIOCSIFFLAGS: ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) goto out; break; default: ret = -EINVAL; goto out; } rtnl_net_lock(net); ret = -ENODEV; dev = __dev_get_by_name(net, ifr->ifr_name); if (!dev) goto done; if (colon) *colon = ':'; in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ /* note: we only do this for a limited set of ioctls and only if the original address family was AF_INET. This is checked above. 
*/ for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) { if (!strcmp(ifr->ifr_name, ifa->ifa_label) && sin_orig.sin_addr.s_addr == ifa->ifa_local) { break; /* found */ } } } /* we didn't get a match, maybe the application is 4.3BSD-style and passed in junk so we fall back to comparing just the label */ if (!ifa) { for (ifap = &in_dev->ifa_list; (ifa = rtnl_net_dereference(net, *ifap)) != NULL; ifap = &ifa->ifa_next) if (!strcmp(ifr->ifr_name, ifa->ifa_label)) break; } } ret = -EADDRNOTAVAIL; if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; switch (cmd) { case SIOCGIFADDR: /* Get interface address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_local; break; case SIOCGIFBRDADDR: /* Get the broadcast address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_broadcast; break; case SIOCGIFDSTADDR: /* Get the destination address */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_address; break; case SIOCGIFNETMASK: /* Get the netmask for the interface */ ret = 0; sin->sin_addr.s_addr = ifa->ifa_mask; break; case SIOCSIFFLAGS: if (colon) { ret = -EADDRNOTAVAIL; if (!ifa) break; ret = 0; if (!(ifr->ifr_flags & IFF_UP)) inet_del_ifa(in_dev, ifap, 1); break; } /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */ ASSERT_RTNL(); ret = dev_change_flags(dev, ifr->ifr_flags, NULL); break; case SIOCSIFADDR: /* Set interface address (and family) */ ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; if (!ifa) { ret = -ENOBUFS; if (!in_dev) break; ifa = inet_alloc_ifa(in_dev); if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); } else { ret = 0; if (ifa->ifa_local == sin->sin_addr.s_addr) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; ifa->ifa_scope = 0; } ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; if (!(dev->flags & IFF_POINTOPOINT)) { ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); if ((dev->flags & IFF_BROADCAST) && ifa->ifa_prefixlen < 31) ifa->ifa_broadcast = ifa->ifa_address | ~ifa->ifa_mask; } else { ifa->ifa_prefixlen = 32; ifa->ifa_mask = inet_make_mask(32); } set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ret = inet_set_ifa(dev, ifa); break; case SIOCSIFBRDADDR: /* Set the broadcast address */ ret = 0; if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = sin->sin_addr.s_addr; inet_insert_ifa(ifa); } break; case SIOCSIFDSTADDR: /* Set the destination address */ ret = 0; if (ifa->ifa_address == sin->sin_addr.s_addr) break; ret = -EINVAL; if (inet_abc_len(sin->sin_addr.s_addr) < 0) break; ret = 0; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_address = sin->sin_addr.s_addr; inet_insert_ifa(ifa); break; case SIOCSIFNETMASK: /* Set the netmask for the interface */ /* * The mask we set must be legal. */ ret = -EINVAL; if (bad_mask(sin->sin_addr.s_addr, 0)) break; ret = 0; if (ifa->ifa_mask != sin->sin_addr.s_addr) { __be32 old_mask = ifa->ifa_mask; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_mask = sin->sin_addr.s_addr; ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); /* See if current broadcast address matches * with current netmask, then recalculate * the broadcast address. Otherwise it's a * funny address, so don't touch it since * the user seems to know what (s)he's doing... 
*/ if ((dev->flags & IFF_BROADCAST) && (ifa->ifa_prefixlen < 31) && (ifa->ifa_broadcast == (ifa->ifa_local|~old_mask))) { ifa->ifa_broadcast = (ifa->ifa_local | ~sin->sin_addr.s_addr); } inet_insert_ifa(ifa); } break; } done: rtnl_net_unlock(net); out: return ret; } int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl_net(dev); const struct in_ifaddr *ifa; struct ifreq ifr; int done = 0; if (WARN_ON(size > sizeof(struct ifreq))) goto out; if (!in_dev) goto out; in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) { if (!buf) { done += size; continue; } if (len < size) break; memset(&ifr, 0, sizeof(struct ifreq)); strcpy(ifr.ifr_name, ifa->ifa_label); (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = ifa->ifa_local; if (copy_to_user(buf + done, &ifr, size)) { done = -EFAULT; break; } len -= size; done += size; } out: return done; } static __be32 in_dev_select_addr(const struct in_device *in_dev, int scope) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) return ifa->ifa_local; } return 0; } __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { const struct in_ifaddr *ifa; __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; struct net *net; int master_idx; rcu_read_lock(); net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) continue; if (min(ifa->ifa_scope, localnet_scope) > scope) continue; if (!dst || inet_ifa_match(dst, ifa)) { addr = ifa->ifa_local; break; } if (!addr) addr = ifa->ifa_local; } if (addr) goto out_unlock; no_in_dev: master_idx = l3mdev_master_ifindex_rcu(dev); /* For VRFs, the VRF device takes the place of the loopback device, * with addresses on it being preferred. Note in such cases the * loopback device will be among the devices that fail the master_idx * equality check in the loop below. */ if (master_idx && (dev = dev_get_by_index_rcu(net, master_idx)) && (in_dev = __in_dev_get_rcu(dev))) { addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. 
*/ for_each_netdev_rcu(net, dev) { if (l3mdev_master_ifindex_rcu(dev) != master_idx) continue; in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; addr = in_dev_select_addr(in_dev, scope); if (addr) goto out_unlock; } out_unlock: rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) { unsigned char localnet_scope = RT_SCOPE_HOST; const struct in_ifaddr *ifa; __be32 addr = 0; int same = 0; if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev))) localnet_scope = RT_SCOPE_LINK; in_dev_for_each_ifa_rcu(ifa, in_dev) { unsigned char min_scope = min(ifa->ifa_scope, localnet_scope); if (!addr && (local == ifa->ifa_local || !local) && min_scope <= scope) { addr = ifa->ifa_local; if (same) break; } if (!same) { same = (!local || inet_ifa_match(local, ifa)) && (!dst || inet_ifa_match(dst, ifa)); if (same && addr) { if (local || !dst) break; /* Is the selected addr into dst subnet? */ if (inet_ifa_match(addr, ifa)) break; /* No, then can we use new local src? */ if (min_scope <= scope) { addr = ifa->ifa_local; break; } /* search for large dst subnet for addr */ same = 0; } } } return same ? addr : 0; } /* * Confirm that local IP address exists using wildcards: * - net: netns to check, cannot be NULL * - in_dev: only on this interface, NULL=any interface * - dst: only in the same subnet as dst, 0=any dst * - local: address, 0=autoselect the local address * - scope: maximum allowed scope value for the local address */ __be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst, __be32 local, int scope) { __be32 addr = 0; struct net_device *dev; if (in_dev) return confirm_addr_indev(in_dev, dst, local, scope); rcu_read_lock(); for_each_netdev_rcu(net, dev) { in_dev = __in_dev_get_rcu(dev); if (in_dev) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) break; } } rcu_read_unlock(); return addr; } EXPORT_SYMBOL(inet_confirm_addr); /* * Device notifier */ int register_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_chain, nb); } EXPORT_SYMBOL(register_inetaddr_notifier); int unregister_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_notifier); int register_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(register_inetaddr_validator_notifier); int unregister_inetaddr_validator_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_validator_chain, nb); } EXPORT_SYMBOL(unregister_inetaddr_validator_notifier); /* Rename ifa_labels for a device name change. Make some effort to preserve * existing alias numbering and to create unique labels if possible. 
*/ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { struct in_ifaddr *ifa; int named = 0; in_dev_for_each_ifa_rtnl(ifa, in_dev) { char old[IFNAMSIZ], *dot; memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) goto skip; dot = strchr(old, ':'); if (!dot) { sprintf(old, ":%d", named); dot = old; } if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } static void inetdev_send_gratuitous_arp(struct net_device *dev, struct in_device *in_dev) { const struct in_ifaddr *ifa; in_dev_for_each_ifa_rtnl(ifa, in_dev) { arp_send(ARPOP_REQUEST, ETH_P_ARP, ifa->ifa_local, dev, ifa->ifa_local, NULL, dev->dev_addr, NULL); } } /* Called only under RTNL semaphore */ static int inetdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct in_device *in_dev = __in_dev_get_rtnl(dev); ASSERT_RTNL(); if (!in_dev) { if (event == NETDEV_REGISTER) { in_dev = inetdev_init(dev); if (IS_ERR(in_dev)) return notifier_from_errno(PTR_ERR(in_dev)); if (dev->flags & IFF_LOOPBACK) { IN_DEV_CONF_SET(in_dev, NOXFRM, 1); IN_DEV_CONF_SET(in_dev, NOPOLICY, 1); } } else if (event == NETDEV_CHANGEMTU) { /* Re-enabling IP */ if (inetdev_valid_mtu(dev->mtu)) in_dev = inetdev_init(dev); } goto out; } switch (event) { case NETDEV_REGISTER: pr_debug("%s: bug\n", __func__); RCU_INIT_POINTER(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { struct in_ifaddr *ifa = inet_alloc_ifa(in_dev); if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; ifa->ifa_mask = inet_make_mask(8); ifa->ifa_scope = RT_SCOPE_HOST; memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME); ipv4_devconf_setall(in_dev); neigh_parms_data_state_setall(in_dev->arp_parms); inet_insert_ifa(ifa); } } ip_mc_up(in_dev); fallthrough; case NETDEV_CHANGEADDR: if (!IN_DEV_ARP_NOTIFY(in_dev)) break; fallthrough; case NETDEV_NOTIFY_PEERS: /* Send gratuitous ARP to notify of link change */ inetdev_send_gratuitous_arp(dev, in_dev); break; case NETDEV_DOWN: ip_mc_down(in_dev); break; case NETDEV_PRE_TYPE_CHANGE: ip_mc_unmap(in_dev); break; case NETDEV_POST_TYPE_CHANGE: ip_mc_remap(in_dev); break; case NETDEV_CHANGEMTU: if (inetdev_valid_mtu(dev->mtu)) break; /* disable IP when MTU is not enough */ fallthrough; case NETDEV_UNREGISTER: inetdev_destroy(in_dev); break; case NETDEV_CHANGENAME: /* Do not notify about label change, this event is * not interesting to applications using netlink. 
*/ inetdev_changename(dev, in_dev); devinet_sysctl_unregister(in_dev); devinet_sysctl_register(in_dev); break; } out: return NOTIFY_DONE; } static struct notifier_block ip_netdev_notifier = { .notifier_call = inetdev_event, }; static size_t inet_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ + nla_total_size(4) /* IFA_FLAGS */ + nla_total_size(1) /* IFA_PROTO */ + nla_total_size(4) /* IFA_RT_PRIORITY */ + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) { return (cstamp - INITIAL_JIFFIES) * 100UL / HZ; } static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp, unsigned long tstamp, u32 preferred, u32 valid) { struct ifa_cacheinfo ci; ci.cstamp = cstamp_delta(cstamp); ci.tstamp = cstamp_delta(tstamp); ci.ifa_prefered = preferred; ci.ifa_valid = valid; return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci); } static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa, struct inet_fill_args *args) { struct ifaddrmsg *ifm; struct nlmsghdr *nlh; unsigned long tstamp; u32 preferred, valid; u32 flags; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm), args->flags); if (!nlh) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifa_family = AF_INET; ifm->ifa_prefixlen = ifa->ifa_prefixlen; flags = READ_ONCE(ifa->ifa_flags); /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits. * The 32bit value is given in IFA_FLAGS attribute. */ ifm->ifa_flags = (__u8)flags; ifm->ifa_scope = ifa->ifa_scope; ifm->ifa_index = ifa->ifa_dev->dev->ifindex; if (args->netnsid >= 0 && nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) goto nla_put_failure; tstamp = READ_ONCE(ifa->ifa_tstamp); if (!(flags & IFA_F_PERMANENT)) { preferred = READ_ONCE(ifa->ifa_preferred_lft); valid = READ_ONCE(ifa->ifa_valid_lft); if (preferred != INFINITY_LIFE_TIME) { long tval = (jiffies - tstamp) / HZ; if (preferred > tval) preferred -= tval; else preferred = 0; if (valid != INFINITY_LIFE_TIME) { if (valid > tval) valid -= tval; else valid = 0; } } } else { preferred = INFINITY_LIFE_TIME; valid = INFINITY_LIFE_TIME; } if ((ifa->ifa_address && nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) || (ifa->ifa_local && nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) || (ifa->ifa_broadcast && nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) || (ifa->ifa_label[0] && nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) || (ifa->ifa_proto && nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) || nla_put_u32(skb, IFA_FLAGS, flags) || (ifa->ifa_rt_priority && nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) || put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp, preferred, valid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct inet_fill_args *fillargs, struct net **tgt_net, struct sock *sk, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[IFA_MAX+1]; struct ifaddrmsg *ifm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for 
address dump request"); return -EINVAL; } fillargs->ifindex = ifm->ifa_index; if (fillargs->ifindex) { cb->answer_flags |= NLM_F_DUMP_FILTERED; fillargs->flags |= NLM_F_DUMP_FILTERED; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, extack); if (err < 0) return err; for (i = 0; i <= IFA_MAX; ++i) { if (!tb[i]) continue; if (i == IFA_TARGET_NETNSID) { struct net *net; fillargs->netnsid = nla_get_s32(tb[i]); net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { fillargs->netnsid = -1; NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); return PTR_ERR(net); } *tgt_net = net; } else { NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request"); return -EINVAL; } } return 0; } static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, struct netlink_callback *cb, int *s_ip_idx, struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; int err; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (ip_idx < *s_ip_idx) { ip_idx++; continue; } err = inet_fill_ifaddr(skb, ifa, fillargs); if (err < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); ip_idx++; } err = 0; ip_idx = 0; done: *s_ip_idx = ip_idx; return err; } /* Combine dev_addr_genid and dev_base_seq to detect changes. */ static u32 inet_base_seq(const struct net *net) { u32 res = atomic_read(&net->ipv4.dev_addr_genid) + READ_ONCE(net->dev_base_seq); /* Must not return 0 (see nl_dump_check_consistent()). * Chose a value far away from 0. */ if (!res) res = 0x80000000; return res; } static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, .event = RTM_NEWADDR, .flags = NLM_F_MULTI, .netnsid = -1, }; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct { unsigned long ifindex; int ip_idx; } *ctx = (void *)cb->ctx; struct in_device *in_dev; struct net_device *dev; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) goto done; if (fillargs.ifindex) { dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex); if (!dev) { err = -ENODEV; goto done; } in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto done; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); goto done; } } cb->seq = inet_base_seq(tgt_net); for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx, &fillargs); if (err < 0) goto done; } done: if (fillargs.netnsid >= 0) put_net(tgt_net); rcu_read_unlock(); return err; } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { struct inet_fill_args fillargs = { .portid = portid, .seq = nlh ? 
nlh->nlmsg_seq : 0, .event = event, .flags = 0, .netnsid = -1, }; struct sk_buff *skb; int err = -ENOBUFS; struct net *net; net = dev_net(ifa->ifa_dev->dev); skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL); if (!skb) goto errout; err = inet_fill_ifaddr(skb, ifa, &fillargs); if (err < 0) { /* -EMSGSIZE implies BUG in inet_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); } static size_t inet_get_link_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); if (!in_dev) return 0; return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */ } static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr); struct nlattr *nla; int i; if (!in_dev) return -ENODATA; nla = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4); if (!nla) return -EMSGSIZE; for (i = 0; i < IPV4_DEVCONF_MAX; i++) ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]); return 0; } static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { [IFLA_INET_CONF] = { .type = NLA_NESTED }, }; static int inet_validate_link_af(const struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, inet_af_policy, extack); if (err < 0) return err; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) { int cfgid = nla_type(a); if (nla_len(a) < 4) return -EINVAL; if (cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX) return -EINVAL; } } return 0; } static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct in_device *in_dev = __in_dev_get_rtnl(dev); struct nlattr *a, *tb[IFLA_INET_MAX+1]; int rem; if (!in_dev) return -EAFNOSUPPORT; if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) return -EINVAL; if (tb[IFLA_INET_CONF]) { nla_for_each_nested(a, tb[IFLA_INET_CONF], rem) ipv4_devconf_set(in_dev, nla_type(a), nla_get_u32(a)); } return 0; } static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ bool all = false; if (type == NETCONFA_ALL) all = true; if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_BC_FORWARDING) size += nla_total_size(4); if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; } static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, const struct ipv4_devconf *devconf, u32 portid, u32 seq, int event, unsigned int flags, int type) { struct nlmsghdr *nlh; struct netconfmsg *ncm; bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; if (type == NETCONFA_ALL) all = true; ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; if (!devconf) goto out; if ((all || type == NETCONFA_FORWARDING) 
&& nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_BC_FORWARDING) && nla_put_s32(skb, NETCONFA_BC_FORWARDING, IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF_RO(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } void inet_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err); } static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_RP_FILTER] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet_netconf_valid_get_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg), tb, NETCONFA_MAX, devconf_ipv4_policy, extack); if (err) return err; for (i = 0; i <= NETCONFA_MAX; i++) { if (!tb[i]) continue; switch (i) { case NETCONFA_IFINDEX: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request"); return -EINVAL; } } return 0; } static int inet_netconf_get_devconf(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; const struct ipv4_devconf *devconf; struct in_device *in_dev = NULL; struct net_device *dev = NULL; struct sk_buff *skb; int ifindex; int err; err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); if (err) return err; if (!tb[NETCONFA_IFINDEX]) return -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: devconf = net->ipv4.devconf_all; break; case NETCONFA_IFINDEX_DEFAULT: devconf = net->ipv4.devconf_dflt; break; default: err = -ENODEV; dev = dev_get_by_index(net, ifindex); if (dev) in_dev = in_dev_get(dev); if (!in_dev) goto errout; devconf = &in_dev->cnf; 
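		/* the dev and in_dev references taken here are dropped
		 * at the errout label below
		 */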
break; } err = -ENOBUFS; skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: if (in_dev) in_dev_put(in_dev); dev_put(dev); return err; } static int inet_netconf_dump_devconf(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct { unsigned long ifindex; unsigned int all_default; } *ctx = (void *)cb->ctx; const struct in_device *in_dev; struct net_device *dev; int err = 0; if (cb->strict_check) { struct netlink_ext_ack *extack = cb->extack; struct netconfmsg *ncm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ncm))) { NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request"); return -EINVAL; } } rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; err = inet_netconf_fill_devconf(skb, dev->ifindex, &in_dev->cnf, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; } if (ctx->all_default == 0) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } if (ctx->all_default == 1) { err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, NETCONFA_ALL); if (err < 0) goto done; ctx->all_default++; } done: rcu_read_unlock(); return err; } #ifdef CONFIG_SYSCTL static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; rcu_read_lock(); for_each_netdev_rcu(net, dev) { struct in_device *in_dev; in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; } rcu_read_unlock(); } /* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; int on = IPV4_DEVCONF_ALL(net, FORWARDING); IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) dev_disable_lro(dev); in_dev = __in_dev_get_rtnl_net(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } } static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) { if (cnf == net->ipv4.devconf_dflt) return NETCONFA_IFINDEX_DEFAULT; else if (cnf == net->ipv4.devconf_all) return NETCONFA_IFINDEX_ALL; else { struct in_device *idev = container_of(cnf, struct in_device, cnf); return idev->dev->ifindex; } } static int devinet_conf_proc(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, 
loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); int new_value = *(int *)ctl->data; if (write) { struct ipv4_devconf *cnf = ctl->extra1; struct net *net = ctl->extra2; int i = (int *)ctl->data - cnf->data; int ifindex; set_bit(i, cnf->state); if (cnf == net->ipv4.devconf_dflt) devinet_copy_dflt_conf(net, i); if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) rt_cache_flush(net); if (i == IPV4_DEVCONF_BC_FORWARDING - 1 && new_value != old_value) rt_cache_flush(net); if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_RP_FILTER, ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } return ret; } static int devinet_sysctl_forward(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; struct net *net = ctl->extra2; int ret; if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_net_trylock(net)) { /* Restore the original values before restarting */ *valp = val; *ppos = pos; return restart_syscall(); } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else { struct ipv4_devconf *cnf = ctl->extra1; struct in_device *idev = container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); } rtnl_net_unlock(net); rt_cache_flush(net); } else inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } return ret; } static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) rt_cache_flush(net); return ret; } #define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ { \ .procname = name, \ .data = ipv4_devconf.data + \ IPV4_DEVCONF_ ## attr - 1, \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ .extra1 = &ipv4_devconf, \ } #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) #define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table devinet_vars[IPV4_DEVCONF_MAX]; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, 
"mc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"), DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"), DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"), DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"), DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"), DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"), DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"), DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER, "arp_evict_nocarrier"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, "force_igmp_version"), DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL, "igmpv2_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL, "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES, "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, "drop_unicast_in_l2_multicast"), }, }; static int __devinet_sysctl_register(struct net *net, char *dev_name, int ifindex, struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; char path[sizeof("net/ipv4/conf/") + IFNAMSIZ]; t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT); if (!t) goto out; for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) { t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; t->devinet_vars[i].extra1 = p; t->devinet_vars[i].extra2 = net; } snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name); t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars); if (!t->sysctl_header) goto free; p->sysctl = t; inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, ifindex, p); return 0; free: kfree(t); out: return -ENOMEM; } static void __devinet_sysctl_unregister(struct net *net, struct ipv4_devconf *cnf, int ifindex) { struct devinet_sysctl_table *t = cnf->sysctl; if (t) { cnf->sysctl = NULL; unregister_net_sysctl_table(t->sysctl_header); kfree(t); } inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int devinet_sysctl_register(struct in_device *idev) { int err; if (!sysctl_dev_name_is_allowed(idev->dev->name)) return -EINVAL; err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL); if (err) return err; err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, idev->dev->ifindex, &idev->cnf); if (err) neigh_sysctl_unregister(idev->arp_parms); return err; } static void devinet_sysctl_unregister(struct in_device *idev) { struct net *net = dev_net(idev->dev); __devinet_sysctl_unregister(net, &idev->cnf, 
idev->dev->ifindex); neigh_sysctl_unregister(idev->arp_parms); } static struct ctl_table ctl_forward_entry[] = { { .procname = "ip_forward", .data = &ipv4_devconf.data[ IPV4_DEVCONF_FORWARDING - 1], .maxlen = sizeof(int), .mode = 0644, .proc_handler = devinet_sysctl_forward, .extra1 = &ipv4_devconf, .extra2 = &init_net, }, }; #endif static __net_init int devinet_init_net(struct net *net) { #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table *tbl; #endif struct ipv4_devconf *all, *dflt; int err; int i; err = -ENOMEM; net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, sizeof(struct hlist_head), GFP_KERNEL); if (!net->ipv4.inet_addr_lst) goto err_alloc_hash; all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL); if (!dflt) goto err_alloc_dflt; #ifdef CONFIG_SYSCTL tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL); if (!tbl) goto err_alloc_ctl; tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; tbl[0].extra1 = all; tbl[0].extra2 = net; #endif if (!net_eq(net, &init_net)) { switch (net_inherit_devconf()) { case 3: /* copy from the current netns */ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, current->nsproxy->net_ns->ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 0: case 1: /* copy from init_net */ memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); break; case 2: /* use compiled values */ break; } } #ifdef CONFIG_SYSCTL err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all); if (err < 0) goto err_reg_all; err = __devinet_sysctl_register(net, "default", NETCONFA_IFINDEX_DEFAULT, dflt); if (err < 0) goto err_reg_dflt; err = -ENOMEM; forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl, ARRAY_SIZE(ctl_forward_entry)); if (!forw_hdr) goto err_reg_ctl; net->ipv4.forw_hdr = forw_hdr; #endif for (i = 0; i < IN4_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); net->ipv4.devconf_all = all; net->ipv4.devconf_dflt = dflt; return 0; #ifdef CONFIG_SYSCTL err_reg_ctl: __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT); err_reg_dflt: __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(tbl); err_alloc_ctl: #endif kfree(dflt); err_alloc_dflt: kfree(all); err_alloc_all: kfree(net->ipv4.inet_addr_lst); err_alloc_hash: return err; } static __net_exit void devinet_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL const struct ctl_table *tbl; #endif cancel_delayed_work_sync(&net->ipv4.addr_chk_work); #ifdef CONFIG_SYSCTL tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, NETCONFA_IFINDEX_DEFAULT); __devinet_sysctl_unregister(net, net->ipv4.devconf_all, NETCONFA_IFINDEX_ALL); kfree(tbl); #endif kfree(net->ipv4.devconf_dflt); kfree(net->ipv4.devconf_all); kfree(net->ipv4.inet_addr_lst); } static __net_initdata struct pernet_operations devinet_ops = { .init = devinet_init_net, .exit = devinet_exit_net, }; static struct rtnl_af_ops inet_af_ops __read_mostly = { .family = AF_INET, .fill_link_af = inet_fill_link_af, .get_link_af_size = inet_get_link_af_size, .validate_link_af = inet_validate_link_af, .set_link_af = inet_set_link_af, }; static const struct rtnl_msg_handler 
devinet_rtnl_msg_handlers[] __initconst = {
	{.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr,
	 .flags = RTNL_FLAG_DOIT_PERNET},
	{.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr,
	 .flags = RTNL_FLAG_DOIT_PERNET},
	{.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr,
	 .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
	{.protocol = PF_INET, .msgtype = RTM_GETNETCONF,
	 .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf,
	 .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
};

void __init devinet_init(void)
{
	register_pernet_subsys(&devinet_ops);
	register_netdevice_notifier(&ip_netdev_notifier);

	if (rtnl_af_register(&inet_af_ops))
		panic("Unable to register inet_af_ops\n");

	rtnl_register_many(devinet_rtnl_msg_handlers);
}
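/*
 * A minimal standalone userspace sketch (not part of devinet.c above):
 * it issues the RTM_GETADDR dump that inet_dump_ifaddr()/inet_fill_ifaddr()
 * answer, and prints the IFA_LOCAL and IFA_LABEL attributes they emit.
 * Only the standard rtnetlink UAPI headers are assumed; the program
 * structure and buffer sizes are illustrative, not taken from the file above.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/if_addr.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct ifaddrmsg ifm;
	} req = {
		.nlh.nlmsg_len	 = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
		.nlh.nlmsg_type	 = RTM_GETADDR,
		.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		.ifm.ifa_family	 = AF_INET,
	};
	long buf[4096];		/* aligned receive buffer */
	ssize_t len;
	int fd;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0 || send(fd, &req, req.nlh.nlmsg_len, 0) < 0)
		return 1;

	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *nlh = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(nlh, len); nlh = NLMSG_NEXT(nlh, len)) {
			struct ifaddrmsg *ifa;
			struct rtattr *rta;
			int alen;

			if (nlh->nlmsg_type == NLMSG_DONE) {
				close(fd);
				return 0;
			}
			if (nlh->nlmsg_type != RTM_NEWADDR)
				continue;

			ifa = NLMSG_DATA(nlh);
			alen = IFA_PAYLOAD(nlh);
			/* walk the attributes put there by inet_fill_ifaddr() */
			for (rta = IFA_RTA(ifa); RTA_OK(rta, alen);
			     rta = RTA_NEXT(rta, alen)) {
				char ip[INET_ADDRSTRLEN];

				if (rta->rta_type == IFA_LOCAL) {
					inet_ntop(AF_INET, RTA_DATA(rta),
						  ip, sizeof(ip));
					printf("ifindex %d: %s/%d\n",
					       (int)ifa->ifa_index, ip,
					       ifa->ifa_prefixlen);
				} else if (rta->rta_type == IFA_LABEL) {
					printf("  label %s\n",
					       (char *)RTA_DATA(rta));
				}
			}
		}
	}
	close(fd);
	return 1;
}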
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * Manage send buffer.
 * Producer:
 * Copy user space data into send buffer, if send buffer space available.
 * Consumer:
 * Trigger RDMA write into RMBE of peer and send CDC, if RMBE space available.
 *
 * Copyright IBM Corp.
2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/workqueue.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/tcp.h> #include "smc.h" #include "smc_wr.h" #include "smc_cdc.h" #include "smc_close.h" #include "smc_ism.h" #include "smc_tx.h" #include "smc_stats.h" #include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() * to wakeup sndbuf producers that blocked with smc_tx_wait(). * called under sk_socket lock. */ static void smc_tx_write_space(struct sock *sk) { struct socket *sock = sk->sk_socket; struct smc_sock *smc = smc_sk(sk); struct socket_wq *wq; /* similar to sk_stream_write_space */ if (atomic_read(&smc->conn.sndbuf_space) && sock) { if (test_bit(SOCK_NOSPACE, &sock->flags)) SMC_STAT_RMB_TX_FULL(smc, !smc->conn.lnk); clear_bit(SOCK_NOSPACE, &sock->flags); rcu_read_lock(); wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible_poll(&wq->wait, EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT); rcu_read_unlock(); } } /* Wakeup sndbuf producers that blocked with smc_tx_wait(). * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space(). */ void smc_tx_sndbuf_nonfull(struct smc_sock *smc) { if (smc->sk.sk_socket && test_bit(SOCK_NOSPACE, &smc->sk.sk_socket->flags)) smc->sk.sk_write_space(&smc->sk); } /* blocks sndbuf producer until at least one byte of free space available * or urgent Byte was consumed */ static int smc_tx_wait(struct smc_sock *smc, int flags) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; long timeo; int rc = 0; /* similar to sk_stream_wait_memory */ timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); add_wait_queue(sk_sleep(sk), &wait); while (1) { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || conn->killed || conn->local_tx_ctrl.conn_state_flags.peer_done_writing) { rc = -EPIPE; break; } if (smc_cdc_rxed_any_close(conn)) { rc = -ECONNRESET; break; } if (!timeo) { /* ensure EPOLLOUT is subsequently generated */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); rc = -EAGAIN; break; } if (signal_pending(current)) { rc = sock_intr_errno(timeo); break; } sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend) break; /* at least 1 byte of free & no urgent data */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk_wait_event(sk, &timeo, READ_ONCE(sk->sk_err) || (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) || smc_cdc_rxed_any_close(conn) || (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend), &wait); } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static bool smc_tx_is_corked(struct smc_sock *smc) { struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } /* If we have pending CDC messages, do not send: * Because CQE of this CDC message will happen shortly, it gives * a chance to coalesce future sendmsg() payload in to one RDMA Write, * without need for a timer, and with no latency trade off. * Algorithm here: * 1. First message should never cork * 2. If we have pending Tx CDC messages, wait for the first CDC * message's completion * 3. 
Don't cork to much data in a single RDMA Write to prevent burst * traffic, total corked message should not exceed sendbuf/2 */ static bool smc_should_autocork(struct smc_sock *smc) { struct smc_connection *conn = &smc->conn; int corking_size; corking_size = min_t(unsigned int, conn->sndbuf_desc->len >> 1, sock_net(&smc->sk)->smc.sysctl_autocorking_size); if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || smc_tx_prepared_sends(conn) > corking_size) return false; return true; } static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) { struct smc_connection *conn = &smc->conn; if (smc_should_autocork(smc)) return true; /* for a corked socket defer the RDMA writes if * sndbuf_space is still available. The applications * should known how/when to uncork it. */ if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && atomic_read(&conn->sndbuf_space)) return true; return false; } /* sndbuf producer: main API called by socket layer. * called under sock lock. */ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) { size_t copylen, send_done = 0, send_remaining = len; size_t chunk_len, chunk_off, chunk_len_sum; struct smc_connection *conn = &smc->conn; union smc_host_cursor prep; struct sock *sk = &smc->sk; char *sndbuf_base; int tx_cnt_prep; int writespace; int rc, chunk; /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { rc = -EPIPE; goto out_err; } if (sk->sk_state == SMC_INIT) return -ENOTCONN; if (len > conn->sndbuf_desc->len) SMC_STAT_RMB_TX_SIZE_SMALL(smc, !conn->lnk); if (len > conn->peer_rmbe_size) SMC_STAT_RMB_TX_PEER_SIZE_SMALL(smc, !conn->lnk); if (msg->msg_flags & MSG_OOB) SMC_STAT_INC(smc, urg_data_cnt); while (msg_data_left(msg)) { if (smc->sk.sk_shutdown & SEND_SHUTDOWN || (smc->sk.sk_err == ECONNABORTED) || conn->killed) return -EPIPE; if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; if (msg->msg_flags & MSG_OOB) conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { if (send_done) return send_done; rc = smc_tx_wait(smc, msg->msg_flags); if (rc) goto out_err; continue; } /* initialize variables for 1st iteration of subsequent loop */ /* could be just 1 byte, even after smc_tx_wait above */ writespace = atomic_read(&conn->sndbuf_space); /* not more than what user space asked for */ copylen = min_t(size_t, send_remaining, writespace); /* determine start of sndbuf */ sndbuf_base = conn->sndbuf_desc->cpu_addr; smc_curs_copy(&prep, &conn->tx_curs_prep, conn); tx_cnt_prep = prep.count; /* determine chunks where to write into sndbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len - tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; for (chunk = 0; chunk < 2; chunk++) { rc = memcpy_from_msg(sndbuf_base + chunk_off, msg, chunk_len); if (rc) { smc_sndbuf_sync_sg_for_device(conn); if (send_done) return send_done; goto out_err; } send_done += chunk_len; send_remaining -= chunk_len; if (chunk_len_sum == copylen) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ chunk_len = copylen - chunk_len; /* remainder */ chunk_len_sum += chunk_len; chunk_off = 0; /* modulo offset in send ring buffer */ } smc_sndbuf_sync_sg_for_device(conn); /* update cursors */ smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); smc_curs_copy(&conn->tx_curs_prep, &prep, conn); /* increased in send tasklet 
smc_cdc_tx_handler() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->sndbuf_space); /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); /* since we just produced more new data into sndbuf, * trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; /* If we need to cork, do nothing and wait for the next * sendmsg() call or push on tx completion */ if (!smc_tx_should_cork(smc, msg)) smc_tx_sndbuf_nonempty(conn); trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ return send_done; out_err: rc = sk_stream_error(sk, msg->msg_flags, rc); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(rc == -EAGAIN)) sk->sk_write_space(sk); return rc; } /***************************** sndbuf consumer *******************************/ /* sndbuf consumer: actual data transfer of one target chunk with ISM write */ int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, u32 offset, int signal) { int rc; rc = smc_ism_write(conn->lgr->smcd, conn->peer_token, conn->peer_rmbe_idx, signal, conn->tx_off + offset, data, len); if (rc) conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; return rc; } /* sndbuf consumer: actual data transfer of one target chunk with RDMA write */ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; struct smc_link *link = conn->lnk; int rc; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) smcr_link_down_cond_sched(link); return rc; } /* sndbuf consumer */ static inline void smc_tx_advance_cursors(struct smc_connection *conn, union smc_host_cursor *prod, union smc_host_cursor *sent, size_t len) { smc_curs_add(conn->peer_rmbe_size, prod, len); /* increased in recv tasklet smc_cdc_msg_rcv() */ smp_mb__before_atomic(); /* data in flight reduces usable snd_wnd */ atomic_sub(len, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); smc_curs_add(conn->sndbuf_desc->len, sent, len); } /* SMC-R helper for smc_tx_rdma_writes() */ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t src_off, size_t src_len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { struct smc_link *link = conn->lnk; dma_addr_t dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); u64 virt_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr; int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; int num_sges; int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk]; struct ib_sge *sge = wr->wr.sg_list; u64 base_addr = dma_addr; if (dst_len < link->qp_attr.cap.max_inline_data) { base_addr = virt_addr; wr->wr.send_flags |= IB_SEND_INLINE; } else { wr->wr.send_flags &= ~IB_SEND_INLINE; } num_sges = 0; for (srcchunk = 0; srcchunk < 2; srcchunk++) { sge[srcchunk].addr = conn->sndbuf_desc->is_vm ? 
(virt_addr + src_off) : (base_addr + src_off); sge[srcchunk].length = src_len; if (conn->sndbuf_desc->is_vm) sge[srcchunk].lkey = conn->sndbuf_desc->mr[link->link_idx]->lkey; num_sges++; src_off += src_len; if (src_off >= conn->sndbuf_desc->len) src_off -= conn->sndbuf_desc->len; /* modulo in send ring */ if (src_len_sum == dst_len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr); if (rc) return rc; if (dst_len_sum == len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ dst_off = 0; /* modulo offset in RMBE ring buffer */ dst_len = len - dst_len; /* remainder */ dst_len_sum += dst_len; src_len = min_t(int, dst_len, conn->sndbuf_desc->len - sent_count); src_len_sum = src_len; } return 0; } /* SMC-D helper for smc_tx_rdma_writes() */ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t src_off, size_t src_len, size_t dst_off, size_t dst_len) { int src_len_sum = src_len, dst_len_sum = dst_len; int srcchunk, dstchunk; int rc; for (dstchunk = 0; dstchunk < 2; dstchunk++) { for (srcchunk = 0; srcchunk < 2; srcchunk++) { void *data = conn->sndbuf_desc->cpu_addr + src_off; rc = smcd_tx_ism_write(conn, data, src_len, dst_off + sizeof(struct smcd_cdc_msg), 0); if (rc) return rc; dst_off += src_len; src_off += src_len; if (src_off >= conn->sndbuf_desc->len) src_off -= conn->sndbuf_desc->len; /* modulo in send ring */ if (src_len_sum == dst_len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ src_len = dst_len - src_len; /* remainder */ src_len_sum += src_len; } if (dst_len_sum == len) break; /* either on 1st or 2nd iteration */ /* prepare next (== 2nd) iteration */ dst_off = 0; /* modulo offset in RMBE ring buffer */ dst_len = len - dst_len; /* remainder */ dst_len_sum += dst_len; src_len = min_t(int, dst_len, conn->sndbuf_desc->len - src_off); src_len_sum = src_len; } return 0; } /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; * usable snd_wnd as max transmit */ static int smc_tx_rdma_writes(struct smc_connection *conn, struct smc_rdma_wr *wr_rdma_buf) { size_t len, src_len, dst_off, dst_len; /* current chunk values */ union smc_host_cursor sent, prep, prod, cons; struct smc_cdc_producer_flags *pflags; int to_send, rmbespace; int rc; /* source: sndbuf */ smc_curs_copy(&sent, &conn->tx_curs_sent, conn); smc_curs_copy(&prep, &conn->tx_curs_prep, conn); /* cf. wmem_alloc - (snd_max - snd_una) */ to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); if (to_send <= 0) return 0; /* destination: RMBE */ /* cf. snd_wnd */ rmbespace = atomic_read(&conn->peer_rmbe_space); if (rmbespace <= 0) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); return 0; } smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); /* if usable snd_wnd closes ask peer to advertise once it opens again */ pflags = &conn->local_tx_ctrl.prod_flags; pflags->write_blocked = (to_send >= rmbespace); /* cf. 
usable snd_wnd */ len = min(to_send, rmbespace); /* initialize variables for first iteration of subsequent nested loop */ dst_off = prod.count; if (prod.wrap == cons.wrap) { /* the filled destination area is unwrapped, * hence the available free destination space is wrapped * and we need 2 destination chunks of sum len; start with 1st * which is limited by what's available in sndbuf */ dst_len = min_t(size_t, conn->peer_rmbe_size - prod.count, len); } else { /* the filled destination area is wrapped, * hence the available free destination space is unwrapped * and we need a single destination chunk of entire len */ dst_len = len; } /* dst_len determines the maximum src_len */ if (sent.count + dst_len <= conn->sndbuf_desc->len) { /* unwrapped src case: single chunk of entire dst_len */ src_len = dst_len; } else { /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ src_len = conn->sndbuf_desc->len - sent.count; } if (conn->lgr->is_smcd) rc = smcd_tx_rdma_writes(conn, len, sent.count, src_len, dst_off, dst_len); else rc = smcr_tx_rdma_writes(conn, len, sent.count, src_len, dst_off, dst_len, wr_rdma_buf); if (rc) return rc; if (conn->urg_tx_pend && len == to_send) pflags->urg_data_present = 1; smc_tx_advance_cursors(conn, &prod, &sent, len); /* update connection's cursors with advanced local cursors */ smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn); /* dst: peer RMBE */ smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */ return 0; } /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; struct smc_link *link = conn->lnk; struct smc_rdma_wr *wr_rdma_buf; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; if (!link || !smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { smc_wr_tx_link_put(link); if (rc == -EBUSY) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); if (smc->sk.sk_err == ECONNABORTED) return sock_error(&smc->sk); if (conn->killed) return -EPIPE; rc = 0; mod_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_WORK_DELAY); } return rc; } spin_lock_bh(&conn->send_lock); if (link != conn->lnk) { /* link of connection changed, tx_work will restart */ smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); rc = -ENOLINK; goto out_unlock; } if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } } rc = smc_cdc_msg_send(conn, wr_buf, pend); if (!rc && pflags->urg_data_present) { pflags->urg_data_pending = 0; pflags->urg_data_present = 0; } out_unlock: spin_unlock_bh(&conn->send_lock); smc_wr_tx_link_put(link); return rc; } static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; int rc = 0; spin_lock_bh(&conn->send_lock); if (!pflags->urg_data_present) rc = smc_tx_rdma_writes(conn, NULL); if (!rc) rc = smcd_cdc_msg_send(conn); if (!rc && pflags->urg_data_present) { pflags->urg_data_pending = 0; pflags->urg_data_present = 0; } spin_unlock_bh(&conn->send_lock); return rc; } int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc = 0; /* No data in the send queue 
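	 * (i.e. the prep cursor has not advanced past the sent cursor, so
	 *  smc_tx_prepared_sends() reports nothing left to transmit)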
*/ if (unlikely(smc_tx_prepared_sends(conn) <= 0)) goto out; /* Peer don't have RMBE space */ if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); goto out; } if (conn->killed || conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { rc = -EPIPE; /* connection being aborted */ goto out; } if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else rc = smcr_tx_sndbuf_nonempty(conn); if (!rc) { /* trigger socket release if connection is closing */ smc_close_wake_tx_prepared(smc); } out: return rc; } /* Wakeup sndbuf consumers from process context * since there is more data to transmit. The caller * must hold sock lock. */ void smc_tx_pending(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); int rc; if (smc->sk.sk_err) return; rc = smc_tx_sndbuf_nonempty(conn); if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; } /* Wakeup sndbuf consumers from process context * since there is more data to transmit in locked * sock. */ void smc_tx_work(struct work_struct *work) { struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); lock_sock(&smc->sk); smc_tx_pending(conn); release_sock(&smc->sk); } void smc_tx_consumer_update(struct smc_connection *conn, bool force) { union smc_host_cursor cfed, cons, prod; int sender_free = conn->rmb_desc->len; int to_confirm; smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn); to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); if (to_confirm > conn->rmbe_update_limit) { smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn); sender_free = conn->rmb_desc->len - smc_curs_diff_large(conn->rmb_desc->len, &cfed, &prod); } if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || force || ((to_confirm > conn->rmbe_update_limit) && ((sender_free <= (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { if (conn->killed || conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) return; if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && !conn->killed) { queue_delayed_work(conn->lgr->tx_wq, &conn->tx_work, SMC_TX_WORK_DELAY); return; } } if (conn->local_rx_ctrl.prod_flags.write_blocked && !atomic_read(&conn->bytes_to_rcv)) conn->local_rx_ctrl.prod_flags.write_blocked = 0; } /***************************** send initialize *******************************/ /* Initialize send properties on connection establishment. NB: not __init! */ void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; } |
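/*
 * A minimal standalone userspace sketch (not part of smc_tx.c above): the
 * two-chunk copy pattern that smc_tx_sendmsg() uses to fill the circular
 * send buffer, and that smcr_tx_rdma_writes()/smcd_tx_rdma_writes() mirror
 * on the destination side. At most two memcpy() calls are needed: one for
 * the tail of the ring and, if the data wraps, one for its head. The struct
 * and function names below are illustrative only.
 */
#include <stdio.h>
#include <string.h>

struct ring {
	char buf[16];	/* cf. sndbuf_desc->cpu_addr / sndbuf_desc->len */
	size_t prep;	/* cf. tx_curs_prep.count: next byte to fill    */
	size_t space;	/* cf. sndbuf_space: free bytes in the ring     */
};

/* copy up to ->space bytes of data[], wrapping at the end of the buffer */
static size_t ring_produce(struct ring *r, const char *data, size_t len)
{
	size_t copylen = len < r->space ? len : r->space;
	/* 1st chunk: from prep up to the end of the buffer (or all of it) */
	size_t chunk = copylen < sizeof(r->buf) - r->prep ?
		       copylen : sizeof(r->buf) - r->prep;

	memcpy(r->buf + r->prep, data, chunk);
	if (chunk < copylen)	/* wrapped: 2nd chunk starts at offset 0 */
		memcpy(r->buf, data + chunk, copylen - chunk);

	r->prep = (r->prep + copylen) % sizeof(r->buf);	/* cf. smc_curs_add() */
	r->space -= copylen;		/* cf. atomic_sub(&conn->sndbuf_space) */
	return copylen;
}

int main(void)
{
	struct ring r = { .prep = 12, .space = 16 };
	size_t n = ring_produce(&r, "hello, world", 12);

	/* 4 bytes land at offsets 12..15, the remaining 8 wrap to 0..7 */
	printf("copied %zu bytes, prep now %zu, space now %zu\n",
	       n, r.prep, r.space);
	return 0;
}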
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * A driver for the AverMedia MR 800 USB FM radio. This device plugs
 * into both the USB and an analog audio input, so this driver only
 * deals with initialization and frequency setting; the audio data
 * has to be handled by a sound driver.
 *
 * Copyright (c) 2008 Alexey Klimov <klimov.linux@gmail.com>
 */

/*
 * Big thanks to authors and contributors of dsbr100.c and radio-si470x.c
 *
 * When this work already looked pretty good, I discovered these:
 * http://av-usbradio.sourceforge.net/index.php
 * http://sourceforge.net/projects/av-usbradio/
 * The latest release of their project was in 2005. This driver could
 * probably be improved using their achievements (specifications given).
 * Also, Faidon Liambotis <paravoid@debian.org> wrote a nice driver for this
 * radio in 2007. He allowed his driver to be used to improve the current
 * mr800 radio driver.
 * http://www.spinics.net/lists/linux-usb-devel/msg10109.html
 *
 * Version 0.01: First working version.
 *               It's required to blacklist AverMedia USB Radio
 *               in usbhid/hid-quirks.c
 * Version 0.10: A lot of cleanups and fixes: unplugging the device,
 *               a few mutex locks were added, coding style issues, etc.
 *               Added stereo support.
Thanks to * Douglas Schilling Landgraf <dougsland@gmail.com> and * David Ellingsworth <david@identd.dyndns.org> * for discussion, help and support. * Version 0.11: Converted to v4l2_device. * * Many things to do: * - Correct power management of device (suspend & resume) * - Add code for scanning and smooth tuning * - Add code for sensitivity value * - Correct mistakes * - In Japan another FREQ_MIN and FREQ_MAX */ /* kernel includes */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/input.h> #include <linux/videodev2.h> #include <media/v4l2-device.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> #include <linux/usb.h> #include <linux/mutex.h> /* driver and module definitions */ #define DRIVER_AUTHOR "Alexey Klimov <klimov.linux@gmail.com>" #define DRIVER_DESC "AverMedia MR 800 USB FM radio driver" #define DRIVER_VERSION "0.1.2" MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); MODULE_VERSION(DRIVER_VERSION); #define USB_AMRADIO_VENDOR 0x07ca #define USB_AMRADIO_PRODUCT 0xb800 /* dev_warn macro with driver name */ #define MR800_DRIVER_NAME "radio-mr800" #define amradio_dev_warn(dev, fmt, arg...) \ dev_warn(dev, MR800_DRIVER_NAME " - " fmt, ##arg) #define amradio_dev_err(dev, fmt, arg...) \ dev_err(dev, MR800_DRIVER_NAME " - " fmt, ##arg) /* Probably USB_TIMEOUT should be modified in module parameter */ #define BUFFER_LENGTH 8 #define USB_TIMEOUT 500 /* Frequency limits in MHz -- these are European values. For Japanese devices, that would be 76 and 91. */ #define FREQ_MIN 87.5 #define FREQ_MAX 108.0 #define FREQ_MUL 16000 /* * Commands that device should understand * List isn't full and will be updated with implementation of new functions */ #define AMRADIO_SET_FREQ 0xa4 #define AMRADIO_GET_READY_FLAG 0xa5 #define AMRADIO_GET_SIGNAL 0xa7 #define AMRADIO_GET_FREQ 0xa8 #define AMRADIO_SET_SEARCH_UP 0xa9 #define AMRADIO_SET_SEARCH_DOWN 0xaa #define AMRADIO_SET_MUTE 0xab #define AMRADIO_SET_RIGHT_MUTE 0xac #define AMRADIO_SET_LEFT_MUTE 0xad #define AMRADIO_SET_MONO 0xae #define AMRADIO_SET_SEARCH_LVL 0xb0 #define AMRADIO_STOP_SEARCH 0xb1 /* Comfortable defines for amradio_set_stereo */ #define WANT_STEREO 0x00 #define WANT_MONO 0x01 /* module parameter */ static int radio_nr = -1; module_param(radio_nr, int, 0); MODULE_PARM_DESC(radio_nr, "Radio Nr"); /* Data for one (physical) device */ struct amradio_device { /* reference to USB and video device */ struct usb_device *usbdev; struct usb_interface *intf; struct video_device vdev; struct v4l2_device v4l2_dev; struct v4l2_ctrl_handler hdl; u8 *buffer; struct mutex lock; /* buffer locking */ int curfreq; int stereo; int muted; }; static inline struct amradio_device *to_amradio_dev(struct v4l2_device *v4l2_dev) { return container_of(v4l2_dev, struct amradio_device, v4l2_dev); } static int amradio_send_cmd(struct amradio_device *radio, u8 cmd, u8 arg, u8 *extra, u8 extralen, bool reply) { int retval; int size; radio->buffer[0] = 0x00; radio->buffer[1] = 0x55; radio->buffer[2] = 0xaa; radio->buffer[3] = extralen; radio->buffer[4] = cmd; radio->buffer[5] = arg; radio->buffer[6] = 0x00; radio->buffer[7] = extra || reply ? 8 : 0; retval = usb_bulk_msg(radio->usbdev, usb_sndintpipe(radio->usbdev, 2), radio->buffer, BUFFER_LENGTH, &size, USB_TIMEOUT); if (retval < 0 || size != BUFFER_LENGTH) { if (video_is_registered(&radio->vdev)) amradio_dev_warn(&radio->vdev.dev, "cmd %02x failed\n", cmd); return retval ? 
retval : -EIO; } if (!extra && !reply) return 0; if (extra) { memcpy(radio->buffer, extra, extralen); memset(radio->buffer + extralen, 0, 8 - extralen); retval = usb_bulk_msg(radio->usbdev, usb_sndintpipe(radio->usbdev, 2), radio->buffer, BUFFER_LENGTH, &size, USB_TIMEOUT); } else { memset(radio->buffer, 0, 8); retval = usb_bulk_msg(radio->usbdev, usb_rcvbulkpipe(radio->usbdev, 0x81), radio->buffer, BUFFER_LENGTH, &size, USB_TIMEOUT); } if (retval == 0 && size == BUFFER_LENGTH) return 0; if (video_is_registered(&radio->vdev) && cmd != AMRADIO_GET_READY_FLAG) amradio_dev_warn(&radio->vdev.dev, "follow-up to cmd %02x failed\n", cmd); return retval ? retval : -EIO; } /* switch on/off the radio. Send 8 bytes to device */ static int amradio_set_mute(struct amradio_device *radio, bool mute) { int ret = amradio_send_cmd(radio, AMRADIO_SET_MUTE, mute, NULL, 0, false); if (!ret) radio->muted = mute; return ret; } /* set a frequency, freq is defined by v4l's TUNER_LOW, i.e. 1/16th kHz */ static int amradio_set_freq(struct amradio_device *radio, int freq) { unsigned short freq_send; u8 buf[3]; int retval; /* we need to be sure that frequency isn't out of range */ freq = clamp_t(unsigned, freq, FREQ_MIN * FREQ_MUL, FREQ_MAX * FREQ_MUL); freq_send = 0x10 + (freq >> 3) / 25; /* frequency is calculated from freq_send and placed in first 2 bytes */ buf[0] = (freq_send >> 8) & 0xff; buf[1] = freq_send & 0xff; buf[2] = 0x01; retval = amradio_send_cmd(radio, AMRADIO_SET_FREQ, 0, buf, 3, false); if (retval) return retval; radio->curfreq = freq; msleep(40); return 0; } static int amradio_set_stereo(struct amradio_device *radio, bool stereo) { int ret = amradio_send_cmd(radio, AMRADIO_SET_MONO, !stereo, NULL, 0, false); if (!ret) radio->stereo = stereo; return ret; } static int amradio_get_stat(struct amradio_device *radio, bool *is_stereo, u32 *signal) { int ret = amradio_send_cmd(radio, AMRADIO_GET_SIGNAL, 0, NULL, 0, true); if (ret) return ret; *is_stereo = radio->buffer[2] >> 7; *signal = (radio->buffer[3] & 0xf0) << 8; return 0; } /* Handle unplugging the device. * We call video_unregister_device in any case. * The last function called in this procedure is * usb_amradio_device_release. 
*/ static void usb_amradio_disconnect(struct usb_interface *intf) { struct amradio_device *radio = to_amradio_dev(usb_get_intfdata(intf)); mutex_lock(&radio->lock); video_unregister_device(&radio->vdev); amradio_set_mute(radio, true); usb_set_intfdata(intf, NULL); v4l2_device_disconnect(&radio->v4l2_dev); mutex_unlock(&radio->lock); v4l2_device_put(&radio->v4l2_dev); } /* vidioc_querycap - query device capabilities */ static int vidioc_querycap(struct file *file, void *priv, struct v4l2_capability *v) { struct amradio_device *radio = video_drvdata(file); strscpy(v->driver, "radio-mr800", sizeof(v->driver)); strscpy(v->card, "AverMedia MR 800 USB FM Radio", sizeof(v->card)); usb_make_path(radio->usbdev, v->bus_info, sizeof(v->bus_info)); return 0; } /* vidioc_g_tuner - get tuner attributes */ static int vidioc_g_tuner(struct file *file, void *priv, struct v4l2_tuner *v) { struct amradio_device *radio = video_drvdata(file); bool is_stereo = false; int retval; if (v->index > 0) return -EINVAL; v->signal = 0; retval = amradio_get_stat(radio, &is_stereo, &v->signal); if (retval) return retval; strscpy(v->name, "FM", sizeof(v->name)); v->type = V4L2_TUNER_RADIO; v->rangelow = FREQ_MIN * FREQ_MUL; v->rangehigh = FREQ_MAX * FREQ_MUL; v->capability = V4L2_TUNER_CAP_LOW | V4L2_TUNER_CAP_STEREO | V4L2_TUNER_CAP_HWSEEK_WRAP; v->rxsubchans = is_stereo ? V4L2_TUNER_SUB_STEREO : V4L2_TUNER_SUB_MONO; v->audmode = radio->stereo ? V4L2_TUNER_MODE_STEREO : V4L2_TUNER_MODE_MONO; return 0; } /* vidioc_s_tuner - set tuner attributes */ static int vidioc_s_tuner(struct file *file, void *priv, const struct v4l2_tuner *v) { struct amradio_device *radio = video_drvdata(file); if (v->index > 0) return -EINVAL; /* mono/stereo selector */ switch (v->audmode) { case V4L2_TUNER_MODE_MONO: return amradio_set_stereo(radio, WANT_MONO); default: return amradio_set_stereo(radio, WANT_STEREO); } } /* vidioc_s_frequency - set tuner radio frequency */ static int vidioc_s_frequency(struct file *file, void *priv, const struct v4l2_frequency *f) { struct amradio_device *radio = video_drvdata(file); if (f->tuner != 0) return -EINVAL; return amradio_set_freq(radio, f->frequency); } /* vidioc_g_frequency - get tuner radio frequency */ static int vidioc_g_frequency(struct file *file, void *priv, struct v4l2_frequency *f) { struct amradio_device *radio = video_drvdata(file); if (f->tuner != 0 || f->type != V4L2_TUNER_RADIO) return -EINVAL; f->type = V4L2_TUNER_RADIO; f->frequency = radio->curfreq; return 0; } static int vidioc_s_hw_freq_seek(struct file *file, void *priv, const struct v4l2_hw_freq_seek *seek) { static u8 buf[8] = { 0x3d, 0x32, 0x0f, 0x08, 0x3d, 0x32, 0x0f, 0x08 }; struct amradio_device *radio = video_drvdata(file); unsigned long timeout; int retval; if (seek->tuner != 0 || !seek->wrap_around) return -EINVAL; if (file->f_flags & O_NONBLOCK) return -EWOULDBLOCK; retval = amradio_send_cmd(radio, AMRADIO_SET_SEARCH_LVL, 0, buf, 8, false); if (retval) return retval; amradio_set_freq(radio, radio->curfreq); retval = amradio_send_cmd(radio, seek->seek_upward ? 
AMRADIO_SET_SEARCH_UP : AMRADIO_SET_SEARCH_DOWN, 0, NULL, 0, false); if (retval) return retval; timeout = jiffies + msecs_to_jiffies(30000); for (;;) { if (time_after(jiffies, timeout)) { retval = -ENODATA; break; } if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { retval = -ERESTARTSYS; break; } retval = amradio_send_cmd(radio, AMRADIO_GET_READY_FLAG, 0, NULL, 0, true); if (retval) continue; amradio_send_cmd(radio, AMRADIO_GET_FREQ, 0, NULL, 0, true); if (radio->buffer[1] || radio->buffer[2]) { /* To check: sometimes radio->curfreq is set to out of range value */ radio->curfreq = (radio->buffer[1] << 8) | radio->buffer[2]; radio->curfreq = (radio->curfreq - 0x10) * 200; amradio_send_cmd(radio, AMRADIO_STOP_SEARCH, 0, NULL, 0, false); amradio_set_freq(radio, radio->curfreq); retval = 0; break; } } amradio_send_cmd(radio, AMRADIO_STOP_SEARCH, 0, NULL, 0, false); amradio_set_freq(radio, radio->curfreq); return retval; } static int usb_amradio_s_ctrl(struct v4l2_ctrl *ctrl) { struct amradio_device *radio = container_of(ctrl->handler, struct amradio_device, hdl); switch (ctrl->id) { case V4L2_CID_AUDIO_MUTE: return amradio_set_mute(radio, ctrl->val); } return -EINVAL; } static int usb_amradio_init(struct amradio_device *radio) { int retval; retval = amradio_set_mute(radio, true); if (retval) goto out_err; retval = amradio_set_stereo(radio, true); if (retval) goto out_err; retval = amradio_set_freq(radio, radio->curfreq); if (retval) goto out_err; return 0; out_err: amradio_dev_err(&radio->vdev.dev, "initialization failed\n"); return retval; } /* Suspend device - stop device. Need to be checked and fixed */ static int usb_amradio_suspend(struct usb_interface *intf, pm_message_t message) { struct amradio_device *radio = to_amradio_dev(usb_get_intfdata(intf)); mutex_lock(&radio->lock); if (!radio->muted) { amradio_set_mute(radio, true); radio->muted = false; } mutex_unlock(&radio->lock); dev_info(&intf->dev, "going into suspend..\n"); return 0; } /* Resume device - start device. 
Need to be checked and fixed */ static int usb_amradio_resume(struct usb_interface *intf) { struct amradio_device *radio = to_amradio_dev(usb_get_intfdata(intf)); mutex_lock(&radio->lock); amradio_set_stereo(radio, radio->stereo); amradio_set_freq(radio, radio->curfreq); if (!radio->muted) amradio_set_mute(radio, false); mutex_unlock(&radio->lock); dev_info(&intf->dev, "coming out of suspend..\n"); return 0; } static const struct v4l2_ctrl_ops usb_amradio_ctrl_ops = { .s_ctrl = usb_amradio_s_ctrl, }; /* File system interface */ static const struct v4l2_file_operations usb_amradio_fops = { .owner = THIS_MODULE, .open = v4l2_fh_open, .release = v4l2_fh_release, .poll = v4l2_ctrl_poll, .unlocked_ioctl = video_ioctl2, }; static const struct v4l2_ioctl_ops usb_amradio_ioctl_ops = { .vidioc_querycap = vidioc_querycap, .vidioc_g_tuner = vidioc_g_tuner, .vidioc_s_tuner = vidioc_s_tuner, .vidioc_g_frequency = vidioc_g_frequency, .vidioc_s_frequency = vidioc_s_frequency, .vidioc_s_hw_freq_seek = vidioc_s_hw_freq_seek, .vidioc_log_status = v4l2_ctrl_log_status, .vidioc_subscribe_event = v4l2_ctrl_subscribe_event, .vidioc_unsubscribe_event = v4l2_event_unsubscribe, }; static void usb_amradio_release(struct v4l2_device *v4l2_dev) { struct amradio_device *radio = to_amradio_dev(v4l2_dev); /* free rest memory */ v4l2_ctrl_handler_free(&radio->hdl); v4l2_device_unregister(&radio->v4l2_dev); kfree(radio->buffer); kfree(radio); } /* check if the device is present and register with v4l and usb if it is */ static int usb_amradio_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct amradio_device *radio; int retval; radio = kzalloc(sizeof(struct amradio_device), GFP_KERNEL); if (!radio) { dev_err(&intf->dev, "kmalloc for amradio_device failed\n"); retval = -ENOMEM; goto err; } radio->buffer = kmalloc(BUFFER_LENGTH, GFP_KERNEL); if (!radio->buffer) { dev_err(&intf->dev, "kmalloc for radio->buffer failed\n"); retval = -ENOMEM; goto err_nobuf; } retval = v4l2_device_register(&intf->dev, &radio->v4l2_dev); if (retval < 0) { dev_err(&intf->dev, "couldn't register v4l2_device\n"); goto err_v4l2; } v4l2_ctrl_handler_init(&radio->hdl, 1); v4l2_ctrl_new_std(&radio->hdl, &usb_amradio_ctrl_ops, V4L2_CID_AUDIO_MUTE, 0, 1, 1, 1); if (radio->hdl.error) { retval = radio->hdl.error; dev_err(&intf->dev, "couldn't register control\n"); goto err_ctrl; } mutex_init(&radio->lock); radio->v4l2_dev.ctrl_handler = &radio->hdl; radio->v4l2_dev.release = usb_amradio_release; strscpy(radio->vdev.name, radio->v4l2_dev.name, sizeof(radio->vdev.name)); radio->vdev.v4l2_dev = &radio->v4l2_dev; radio->vdev.fops = &usb_amradio_fops; radio->vdev.ioctl_ops = &usb_amradio_ioctl_ops; radio->vdev.release = video_device_release_empty; radio->vdev.lock = &radio->lock; radio->vdev.device_caps = V4L2_CAP_RADIO | V4L2_CAP_TUNER | V4L2_CAP_HW_FREQ_SEEK; radio->usbdev = interface_to_usbdev(intf); radio->intf = intf; usb_set_intfdata(intf, &radio->v4l2_dev); radio->curfreq = 95.16 * FREQ_MUL; video_set_drvdata(&radio->vdev, radio); retval = usb_amradio_init(radio); if (retval) goto err_vdev; retval = video_register_device(&radio->vdev, VFL_TYPE_RADIO, radio_nr); if (retval < 0) { dev_err(&intf->dev, "could not register video device\n"); goto err_vdev; } return 0; err_vdev: v4l2_ctrl_handler_free(&radio->hdl); err_ctrl: v4l2_device_unregister(&radio->v4l2_dev); err_v4l2: kfree(radio->buffer); err_nobuf: kfree(radio); err: return retval; } /* USB Device ID List */ static const struct usb_device_id usb_amradio_device_table[] = { { 
USB_DEVICE_AND_INTERFACE_INFO(USB_AMRADIO_VENDOR, USB_AMRADIO_PRODUCT, USB_CLASS_HID, 0, 0) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, usb_amradio_device_table); /* USB subsystem interface */ static struct usb_driver usb_amradio_driver = { .name = MR800_DRIVER_NAME, .probe = usb_amradio_probe, .disconnect = usb_amradio_disconnect, .suspend = usb_amradio_suspend, .resume = usb_amradio_resume, .reset_resume = usb_amradio_resume, .id_table = usb_amradio_device_table, }; module_usb_driver(usb_amradio_driver); |
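/*
 * Illustrative userspace sketch (not part of the driver above): tuning the
 * MR 800 through the V4L2 radio interface it registers. The device node
 * path and the 100.0 MHz target are assumptions for the example; the
 * 1/16 kHz unit matches the driver's FREQ_MUL and V4L2_TUNER_CAP_LOW.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/videodev2.h>

int main(void)
{
	struct v4l2_frequency f;
	int fd = open("/dev/radio0", O_RDWR);	/* assumed device node */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&f, 0, sizeof(f));
	f.tuner = 0;
	f.type = V4L2_TUNER_RADIO;
	f.frequency = 100 * 16000;	/* 100.0 MHz in 1/16 kHz units */

	if (ioctl(fd, VIDIOC_S_FREQUENCY, &f) < 0)
		perror("VIDIOC_S_FREQUENCY");

	close(fd);
	return 0;
}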
6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | // SPDX-License-Identifier: GPL-2.0-only /* iptables module to match on related connections */ /* * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_helper.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Related connection matching"); MODULE_ALIAS("ipt_helper"); MODULE_ALIAS("ip6t_helper"); static bool helper_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_helper_info *info = par->matchinfo; const struct nf_conn *ct; const struct nf_conn_help *master_help; const struct nf_conntrack_helper *helper; enum ip_conntrack_info ctinfo; bool ret = info->invert; ct = nf_ct_get(skb, &ctinfo); if (!ct || !ct->master) return ret; master_help = nfct_help(ct->master); if (!master_help) return ret; /* rcu_read_lock()ed by nf_hook_thresh */ helper = rcu_dereference(master_help->helper); if (!helper) return ret; if (info->name[0] == '\0') ret = !ret; else ret ^= !strncmp(helper->name, info->name, strlen(helper->name)); return ret; } static int helper_mt_check(const struct xt_mtchk_param *par) { struct xt_helper_info *info = par->matchinfo; int ret; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info_ratelimited("cannot load conntrack support for proto=%u\n", par->family); return ret; } info->name[sizeof(info->name) - 1] = '\0'; return 0; } static void helper_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match helper_mt_reg __read_mostly = { .name = "helper", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = helper_mt_check, .match = helper_mt, .destroy = helper_mt_destroy, .matchsize = sizeof(struct xt_helper_info), .me = THIS_MODULE, }; static int __init helper_mt_init(void) { return xt_register_match(&helper_mt_reg); } static void __exit helper_mt_exit(void) { xt_unregister_match(&helper_mt_reg); } module_init(helper_mt_init); module_exit(helper_mt_exit); |
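/*
 * Usage note (illustration, not part of the module above): the match is
 * typically used from iptables to act on connections that were set up by a
 * conntrack helper, e.g. accepting FTP data channels expected by the "ftp"
 * helper:
 *
 *	iptables -A FORWARD -m helper --helper ftp -j ACCEPT
 *
 * An empty helper name, as handled in helper_mt(), matches any connection
 * whose master has a helper attached, and the usual "!" prefix inverts the
 * test, mirroring info->invert above.
 */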
16 16 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 | // SPDX-License-Identifier: GPL-2.0 #include <linux/quotaops.h> #include <linux/uuid.h> #include "ext4.h" #include "xattr.h" #include "ext4_jbd2.h" static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, const struct fscrypt_name *src) { memset(dst, 0, sizeof(*dst)); dst->usr_fname = src->usr_fname; dst->disk_name = src->disk_name; dst->hinfo.hash = src->hash; dst->hinfo.minor_hash = src->minor_hash; dst->crypto_buf = src->crypto_buf; } int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) { struct fscrypt_name name; int err; err = fscrypt_setup_filename(dir, iname, lookup, &name); if (err) return err; ext4_fname_from_fscrypt_name(fname, &name); err = ext4_fname_setup_ci_filename(dir, iname, fname); if (err) ext4_fname_free_filename(fname); return err; } int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname) { struct fscrypt_name name; int err; err = fscrypt_prepare_lookup(dir, dentry, &name); if (err) return err; ext4_fname_from_fscrypt_name(fname, &name); err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); if (err) ext4_fname_free_filename(fname); return err; } void ext4_fname_free_filename(struct ext4_filename *fname) { struct fscrypt_name name; name.crypto_buf = fname->crypto_buf; fscrypt_free_filename(&name); fname->crypto_buf.name = NULL; fname->usr_fname = NULL; fname->disk_name.name = NULL; ext4_fname_free_ci_filename(fname); } static bool uuid_is_zero(__u8 u[16]) { int i; for (i = 0; i < 16; i++) if (u[i]) return false; return true; } int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) { struct super_block *sb = file_inode(filp)->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); int err, err2; handle_t *handle; if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { err = mnt_want_write_file(filp); if (err) return err; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto pwsalt_err_exit; } err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto pwsalt_err_journal; lock_buffer(sbi->s_sbh); generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); pwsalt_err_journal: err2 = ext4_journal_stop(handle); if (err2 && !err) err = err2; pwsalt_err_exit: mnt_drop_write_file(filp); if (err) return err; } if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; } static int ext4_get_context(struct inode *inode, void *ctx, size_t 
len) { return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); } static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { handle_t *handle = fs_data; int res, res2, credits, retries = 0; /* * Encrypting the root directory is not allowed because e2fsck expects * lost+found to exist and be unencrypted, and encrypting the root * directory would imply encrypting the lost+found directory as well as * the filename "lost+found" itself. */ if (inode->i_ino == EXT4_ROOT_INO) return -EPERM; if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) return -EINVAL; if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) return -EOPNOTSUPP; res = ext4_convert_inline_data(inode); if (res) return res; /* * If a journal handle was specified, then the encryption context is * being set on a new inode via inheritance and is part of a larger * transaction to create the inode. Otherwise the encryption context is * being set on an existing inode in its own transaction. Only in the * latter case should the "retry on ENOSPC" logic be used. */ if (handle) { res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); /* * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ ext4_set_inode_flags(inode, false); } return res; } res = dquot_initialize(inode); if (res) return res; retry: res = ext4_xattr_set_credits(inode, len, false /* is_create */, &credits); if (res) return res; handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); if (IS_ERR(handle)) return PTR_ERR(handle); res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); /* * Update inode->i_flags - S_ENCRYPTED will be enabled, * S_DAX may be disabled */ ext4_set_inode_flags(inode, false); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); } res2 = ext4_journal_stop(handle); if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; if (!res) res = res2; return res; } static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) { return EXT4_SB(sb)->s_dummy_enc_policy.policy; } static bool ext4_has_stable_inodes(struct super_block *sb) { return ext4_has_feature_stable_inodes(sb); } const struct fscrypt_operations ext4_cryptops = { .needs_bounce_pages = 1, .has_32bit_inodes = 1, .supports_subblock_data_units = 1, .legacy_key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .has_stable_inodes = ext4_has_stable_inodes, }; |
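/*
 * Illustrative userspace sketch (not part of the file above): retrieving the
 * per-filesystem password salt that ext4_ioctl_get_encryption_pwsalt()
 * services. The mount point path is an assumption for the example, and the
 * filesystem must have the encrypt feature enabled.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fscrypt.h>

int main(void)
{
	__u8 salt[16];
	int i;
	int fd = open("/mnt/ext4", O_RDONLY);	/* assumed ext4 mount point */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, FS_IOC_GET_ENCRYPTION_PWSALT, salt) < 0) {
		perror("FS_IOC_GET_ENCRYPTION_PWSALT");
		close(fd);
		return 1;
	}

	for (i = 0; i < 16; i++)
		printf("%02x", salt[i]);
	printf("\n");

	close(fd);
	return 0;
}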
6 6 6 6 6 7 7 1 81 81 30 81 81 80 29 29 85 86 30 80 81 81 81 86 85 86 86 86 86 79 81 8 81 29 29 29 29 7 7 7 6 1 10 10 9 1 7 13 13 7 6 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 | // SPDX-License-Identifier: GPL-2.0 /* * Block device elevator/IO-scheduler. * * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * * 30042000 Jens Axboe <axboe@kernel.dk> : * * Split the elevator a bit so that it is possible to choose a different * one or even write a new "plug in". 
There are three pieces: * - elevator_fn, inserts a new request in the queue list * - elevator_merge_fn, decides whether a new buffer can be merged with * an existing request * - elevator_dequeue_fn, called when a request is taken off the active list * * 20082000 Dave Jones <davej@suse.de> : * Removed tests for max-bomb-segments, which was breaking elvtune * when run without -bN * * Jens: * - Rework again to work with bio instead of buffer_heads * - loose bi_dev comparisons, partition handling is right now * - completely modularize elevator setup and teardown * */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/blktrace_api.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> #include <trace/events/block.h> #include "elevator.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-wbt.h" #include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); /* * Merge hash stuff. */ #define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) /* * Query io scheduler to see if the current process issuing bio may be * merged with rq. */ static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); return true; } /* * can we safely merge with this request? */ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) { if (!blk_rq_merge_ok(rq, bio)) return false; if (!elv_iosched_allow_bio_merge(rq, bio)) return false; return true; } EXPORT_SYMBOL(elv_bio_merge_ok); /** * elevator_match - Check whether @e's name or alias matches @name * @e: Scheduler to test * @name: Elevator name to test * * Return true if the elevator @e's name or alias matches @name. 
*/ static bool elevator_match(const struct elevator_type *e, const char *name) { return !strcmp(e->elevator_name, name) || (e->elevator_alias && !strcmp(e->elevator_alias, name)); } static struct elevator_type *__elevator_find(const char *name) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) if (elevator_match(e, name)) return e; return NULL; } static struct elevator_type *elevator_find_get(const char *name) { struct elevator_type *e; spin_lock(&elv_list_lock); e = __elevator_find(name); if (e && (!elevator_tryget(e))) e = NULL; spin_unlock(&elv_list_lock); return e; } static const struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_type *e) { struct elevator_queue *eq; eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node); if (unlikely(!eq)) return NULL; __elevator_get(e); eq->type = e; kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); return eq; } EXPORT_SYMBOL(elevator_alloc); static void elevator_release(struct kobject *kobj) { struct elevator_queue *e; e = container_of(kobj, struct elevator_queue, kobj); elevator_put(e->type); kfree(e); } void elevator_exit(struct request_queue *q) { struct elevator_queue *e = q->elevator; ioc_clear_queue(q); blk_mq_sched_free_rqs(q); mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); } static inline void __elv_rqhash_del(struct request *rq) { hash_del(&rq->hash); rq->rq_flags &= ~RQF_HASHED; } void elv_rqhash_del(struct request_queue *q, struct request *rq) { if (ELV_ON_HASH(rq)) __elv_rqhash_del(rq); } EXPORT_SYMBOL_GPL(elv_rqhash_del); void elv_rqhash_add(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; BUG_ON(ELV_ON_HASH(rq)); hash_add(e->hash, &rq->hash, rq_hash_key(rq)); rq->rq_flags |= RQF_HASHED; } EXPORT_SYMBOL_GPL(elv_rqhash_add); void elv_rqhash_reposition(struct request_queue *q, struct request *rq) { __elv_rqhash_del(rq); elv_rqhash_add(q, rq); } struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) { struct elevator_queue *e = q->elevator; struct hlist_node *next; struct request *rq; hash_for_each_possible_safe(e->hash, rq, next, hash, offset) { BUG_ON(!ELV_ON_HASH(rq)); if (unlikely(!rq_mergeable(rq))) { __elv_rqhash_del(rq); continue; } if (rq_hash_key(rq) == offset) return rq; } return NULL; } /* * RB-tree support functions for inserting/lookup/removal of requests * in a sorted RB tree. 
*/ void elv_rb_add(struct rb_root *root, struct request *rq) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct request *__rq; while (*p) { parent = *p; __rq = rb_entry(parent, struct request, rb_node); if (blk_rq_pos(rq) < blk_rq_pos(__rq)) p = &(*p)->rb_left; else if (blk_rq_pos(rq) >= blk_rq_pos(__rq)) p = &(*p)->rb_right; } rb_link_node(&rq->rb_node, parent, p); rb_insert_color(&rq->rb_node, root); } EXPORT_SYMBOL(elv_rb_add); void elv_rb_del(struct rb_root *root, struct request *rq) { BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); rb_erase(&rq->rb_node, root); RB_CLEAR_NODE(&rq->rb_node); } EXPORT_SYMBOL(elv_rb_del); struct request *elv_rb_find(struct rb_root *root, sector_t sector) { struct rb_node *n = root->rb_node; struct request *rq; while (n) { rq = rb_entry(n, struct request, rb_node); if (sector < blk_rq_pos(rq)) n = n->rb_left; else if (sector > blk_rq_pos(rq)) n = n->rb_right; else return rq; } return NULL; } EXPORT_SYMBOL(elv_rb_find); enum elv_merge elv_merge(struct request_queue *q, struct request **req, struct bio *bio) { struct elevator_queue *e = q->elevator; struct request *__rq; /* * Levels of merges: * nomerges: No merges at all attempted * noxmerges: Only simple one-hit cache try * merges: All merge tries attempted */ if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return ELEVATOR_NO_MERGE; /* * First try one-hit cache. */ if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) { enum elv_merge ret = blk_try_merge(q->last_merge, bio); if (ret != ELEVATOR_NO_MERGE) { *req = q->last_merge; return ret; } } if (blk_queue_noxmerges(q)) return ELEVATOR_NO_MERGE; /* * See if our hash lookup can find a potential backmerge. */ __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; if (blk_discard_mergable(__rq)) return ELEVATOR_DISCARD_MERGE; return ELEVATOR_BACK_MERGE; } if (e->type->ops.request_merge) return e->type->ops.request_merge(q, req, bio); return ELEVATOR_NO_MERGE; } /* * Attempt to do an insertion back merge. Only check for the case where * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. * * Returns true if we merged, false otherwise. 'free' will contain all * requests that need to be freed. */ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free) { struct request *__rq; bool ret; if (blk_queue_nomerges(q)) return false; /* * First try one-hit cache. */ if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { list_add(&rq->queuelist, free); return true; } if (blk_queue_noxmerges(q)) return false; ret = false; /* * See if our hash lookup can find a potential backmerge. 
*/ while (1) { __rq = elv_rqhash_find(q, blk_rq_pos(rq)); if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break; list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; } return ret; } void elv_merged_request(struct request_queue *q, struct request *rq, enum elv_merge type) { struct elevator_queue *e = q->elevator; if (e->type->ops.request_merged) e->type->ops.request_merged(q, rq, type); if (type == ELEVATOR_BACK_MERGE) elv_rqhash_reposition(q, rq); q->last_merge = rq; } void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; if (e->type->ops.requests_merged) e->type->ops.requests_merged(q, rq, next); elv_rqhash_reposition(q, rq); q->last_merge = rq; } struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; if (e->type->ops.next_request) return e->type->ops.next_request(q, rq); return NULL; } struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; if (e->type->ops.former_request) return e->type->ops.former_request(q, rq); return NULL; } #define to_elv(atr) container_of_const((atr), struct elv_fs_entry, attr) static ssize_t elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; ssize_t error; if (!entry->show) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); error = e->type ? entry->show(e, page) : -ENOENT; mutex_unlock(&e->sysfs_lock); return error; } static ssize_t elv_attr_store(struct kobject *kobj, struct attribute *attr, const char *page, size_t length) { const struct elv_fs_entry *entry = to_elv(attr); struct elevator_queue *e; ssize_t error; if (!entry->store) return -EIO; e = container_of(kobj, struct elevator_queue, kobj); mutex_lock(&e->sysfs_lock); error = e->type ? 
entry->store(e, page, length) : -ENOENT; mutex_unlock(&e->sysfs_lock); return error; } static const struct sysfs_ops elv_sysfs_ops = { .show = elv_attr_show, .store = elv_attr_store, }; static const struct kobj_type elv_ktype = { .sysfs_ops = &elv_sysfs_ops, .release = elevator_release, }; int elv_register_queue(struct request_queue *q, bool uevent) { struct elevator_queue *e = q->elevator; int error; lockdep_assert_held(&q->sysfs_lock); error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { const struct elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { while (attr->attr.name) { if (sysfs_create_file(&e->kobj, &attr->attr)) break; attr++; } } if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags); } return error; } void elv_unregister_queue(struct request_queue *q) { struct elevator_queue *e = q->elevator; lockdep_assert_held(&q->sysfs_lock); if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); } } int elv_register(struct elevator_type *e) { /* finish request is mandatory */ if (WARN_ON_ONCE(!e->ops.finish_request)) return -EINVAL; /* insert_requests and dispatch_request are mandatory */ if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request)) return -EINVAL; /* create icq_cache if requested */ if (e->icq_size) { if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || WARN_ON(e->icq_align < __alignof__(struct io_cq))) return -EINVAL; snprintf(e->icq_cache_name, sizeof(e->icq_cache_name), "%s_io_cq", e->elevator_name); e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size, e->icq_align, 0, NULL); if (!e->icq_cache) return -ENOMEM; } /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); if (__elevator_find(e->elevator_name)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; } list_add_tail(&e->list, &elv_list); spin_unlock(&elv_list_lock); printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name); return 0; } EXPORT_SYMBOL_GPL(elv_register); void elv_unregister(struct elevator_type *e) { /* unregister */ spin_lock(&elv_list_lock); list_del_init(&e->list); spin_unlock(&elv_list_lock); /* * Destroy icq_cache if it exists. icq's are RCU managed. Make * sure all RCU operations are complete before proceeding. */ if (e->icq_cache) { rcu_barrier(); kmem_cache_destroy(e->icq_cache); e->icq_cache = NULL; } } EXPORT_SYMBOL_GPL(elv_unregister); /* * For single queue devices, default to using mq-deadline. If we have multiple * queues or mq-deadline is not available, default to "none". */ static struct elevator_type *elevator_get_default(struct request_queue *q) { if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) return NULL; if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; return elevator_find_get("mq-deadline"); } /* * Use the default elevator settings. If the chosen elevator initialization * fails, fall back to the "none" elevator (no elevator). 
*/ void elevator_init_mq(struct request_queue *q) { struct elevator_type *e; unsigned int memflags; int err; WARN_ON_ONCE(blk_queue_registered(q)); if (unlikely(q->elevator)) return; e = elevator_get_default(q); if (!e) return; /* * We are called before adding disk, when there isn't any FS I/O, * so freezing queue plus canceling dispatch work is enough to * drain any dispatch activities originated from passthrough * requests, then no need to quiesce queue which may add long boot * latency, especially when lots of disks are involved. * * Disk isn't added yet, so verifying queue lock only manually. */ memflags = blk_mq_freeze_queue(q); blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); blk_mq_unfreeze_queue(q, memflags); if (err) { pr_warn("\"%s\" elevator initialization failed, " "falling back to \"none\"\n", e->elevator_name); } elevator_put(e); } /* * Switch to new_e io scheduler. * * If switching fails, we are most likely running out of memory and not able * to restore the old io scheduler, so leaving the io scheduler being none. */ int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { unsigned int memflags; int ret; lockdep_assert_held(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); if (q->elevator) { elv_unregister_queue(q); elevator_exit(q); } ret = blk_mq_init_sched(q, new_e); if (ret) goto out_unfreeze; ret = elv_register_queue(q, true); if (ret) { elevator_exit(q); goto out_unfreeze; } blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); out_unfreeze: blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q, memflags); if (ret) { pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", new_e->elevator_name); } return ret; } void elevator_disable(struct request_queue *q) { unsigned int memflags; lockdep_assert_held(&q->sysfs_lock); memflags = blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); elv_unregister_queue(q); elevator_exit(q); blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); q->elevator = NULL; q->nr_requests = q->tag_set->queue_depth; blk_add_trace_msg(q, "elv switch: none"); blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q, memflags); } /* * Switch this queue to the given IO scheduler. 
*/ static int elevator_change(struct request_queue *q, const char *elevator_name) { struct elevator_type *e; int ret; /* Make sure queue is not in the middle of being removed */ if (!blk_queue_registered(q)) return -ENOENT; if (!strncmp(elevator_name, "none", 4)) { if (q->elevator) elevator_disable(q); return 0; } if (q->elevator && elevator_match(q->elevator->type, elevator_name)) return 0; e = elevator_find_get(elevator_name); if (!e) return -EINVAL; ret = elevator_switch(q, e); elevator_put(e); return ret; } void elv_iosched_load_module(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *found; const char *name; strscpy(elevator_name, buf, sizeof(elevator_name)); name = strstrip(elevator_name); spin_lock(&elv_list_lock); found = __elevator_find(name); spin_unlock(&elv_list_lock); if (!found) request_module("%s-iosched", name); } ssize_t elv_iosched_store(struct gendisk *disk, const char *buf, size_t count) { char elevator_name[ELV_NAME_MAX]; int ret; strscpy(elevator_name, buf, sizeof(elevator_name)); ret = elevator_change(disk->queue, strstrip(elevator_name)); if (!ret) return count; return ret; } ssize_t elv_iosched_show(struct gendisk *disk, char *name) { struct request_queue *q = disk->queue; struct elevator_queue *eq = q->elevator; struct elevator_type *cur = NULL, *e; int len = 0; if (!q->elevator) { len += sprintf(name+len, "[none] "); } else { len += sprintf(name+len, "none "); cur = eq->type; } spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { if (e == cur) len += sprintf(name+len, "[%s] ", e->elevator_name); else len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); len += sprintf(name+len, "\n"); return len; } struct request *elv_rb_former_request(struct request_queue *q, struct request *rq) { struct rb_node *rbprev = rb_prev(&rq->rb_node); if (rbprev) return rb_entry_rq(rbprev); return NULL; } EXPORT_SYMBOL(elv_rb_former_request); struct request *elv_rb_latter_request(struct request_queue *q, struct request *rq) { struct rb_node *rbnext = rb_next(&rq->rb_node); if (rbnext) return rb_entry_rq(rbnext); return NULL; } EXPORT_SYMBOL(elv_rb_latter_request); static int __init elevator_setup(char *str) { pr_warn("Kernel parameter elevator= does not have any effect anymore.\n" "Please use sysfs to set IO scheduler for individual devices.\n"); return 1; } __setup("elevator=", elevator_setup); |
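/*
 * Usage note (illustration, not part of elevator.c above): the per-queue
 * scheduler is selected through the sysfs attribute backed by
 * elv_iosched_store()/elv_iosched_show(). The disk name and the set of
 * registered schedulers below are assumptions for the example:
 *
 *	# cat /sys/block/sda/queue/scheduler
 *	none [mq-deadline] kyber bfq
 *	# echo bfq > /sys/block/sda/queue/scheduler
 *
 * Writing "none" disables the scheduler via elevator_disable(); for names
 * that are not yet registered, the store path's caller can use
 * elv_iosched_load_module() to issue a "%s-iosched" module request before
 * the switch is attempted.
 */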
1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat Ryos driver for Linux * * Copyright (c) 2013 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ #include <linux/types.h> #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/hid-roccat.h> #include "hid-ids.h" #include "hid-roccat-common.h" enum { RYOS_REPORT_NUMBER_SPECIAL = 3, RYOS_USB_INTERFACE_PROTOCOL = 0, }; struct ryos_report_special { uint8_t number; /* RYOS_REPORT_NUMBER_SPECIAL */ uint8_t data[4]; } __packed; ROCCAT_COMMON2_BIN_ATTRIBUTE_W(control, 0x04, 0x03); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(profile, 0x05, 0x03); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(keys_primary, 0x06, 0x7d); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(keys_function, 0x07, 0x5f); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(keys_macro, 0x08, 0x23); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(keys_thumbster, 0x09, 0x17); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(keys_extra, 0x0a, 0x08); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(keys_easyzone, 0x0b, 0x126); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(key_mask, 0x0c, 0x06); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(light, 0x0d, 0x10); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(macro, 0x0e, 0x7d2); ROCCAT_COMMON2_BIN_ATTRIBUTE_R(info, 0x0f, 0x08); ROCCAT_COMMON2_BIN_ATTRIBUTE_W(reset, 0x11, 0x03); ROCCAT_COMMON2_BIN_ATTRIBUTE_W(light_control, 0x13, 0x08); ROCCAT_COMMON2_BIN_ATTRIBUTE_W(talk, 0x16, 0x10); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(stored_lights, 0x17, 0x0566); ROCCAT_COMMON2_BIN_ATTRIBUTE_W(custom_lights, 0x18, 0x14); ROCCAT_COMMON2_BIN_ATTRIBUTE_RW(light_macro, 0x19, 0x07d2); static const struct bin_attribute *const ryos_bin_attrs[] = { &bin_attr_control, &bin_attr_profile, &bin_attr_keys_primary, &bin_attr_keys_function, &bin_attr_keys_macro, &bin_attr_keys_thumbster, &bin_attr_keys_extra, &bin_attr_keys_easyzone, &bin_attr_key_mask, &bin_attr_light, &bin_attr_macro, &bin_attr_info, &bin_attr_reset, &bin_attr_light_control, &bin_attr_talk, &bin_attr_stored_lights, &bin_attr_custom_lights, &bin_attr_light_macro, NULL, }; static const struct attribute_group ryos_group = { .bin_attrs_new = ryos_bin_attrs, }; static const struct attribute_group *ryos_groups[] = { &ryos_group, NULL, }; static const struct class ryos_class = { .name = "ryos", .dev_groups = ryos_groups, }; static int ryos_init_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct usb_device *usb_dev = interface_to_usbdev(intf); struct roccat_common2_device *ryos; int retval; if (intf->cur_altsetting->desc.bInterfaceProtocol != RYOS_USB_INTERFACE_PROTOCOL) { hid_set_drvdata(hdev, NULL); return 0; } ryos = kzalloc(sizeof(*ryos), 
GFP_KERNEL); if (!ryos) { hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, ryos); retval = roccat_common2_device_init_struct(usb_dev, ryos); if (retval) { hid_err(hdev, "couldn't init Ryos device\n"); goto exit_free; } retval = roccat_connect(&ryos_class, hdev, sizeof(struct ryos_report_special)); if (retval < 0) { hid_err(hdev, "couldn't init char dev\n"); } else { ryos->chrdev_minor = retval; ryos->roccat_claimed = 1; } return 0; exit_free: kfree(ryos); return retval; } static void ryos_remove_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct roccat_common2_device *ryos; if (intf->cur_altsetting->desc.bInterfaceProtocol != RYOS_USB_INTERFACE_PROTOCOL) return; ryos = hid_get_drvdata(hdev); if (ryos->roccat_claimed) roccat_disconnect(ryos->chrdev_minor); kfree(ryos); } static int ryos_probe(struct hid_device *hdev, const struct hid_device_id *id) { int retval; if (!hid_is_usb(hdev)) return -EINVAL; retval = hid_parse(hdev); if (retval) { hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { hid_err(hdev, "hw start failed\n"); goto exit; } retval = ryos_init_specials(hdev); if (retval) { hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } return 0; exit_stop: hid_hw_stop(hdev); exit: return retval; } static void ryos_remove(struct hid_device *hdev) { ryos_remove_specials(hdev); hid_hw_stop(hdev); } static int ryos_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct roccat_common2_device *ryos = hid_get_drvdata(hdev); if (intf->cur_altsetting->desc.bInterfaceProtocol != RYOS_USB_INTERFACE_PROTOCOL) return 0; if (data[0] != RYOS_REPORT_NUMBER_SPECIAL) return 0; if (ryos != NULL && ryos->roccat_claimed) roccat_report_event(ryos->chrdev_minor, data); return 0; } static const struct hid_device_id ryos_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_RYOS_MK) }, { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_RYOS_MK_GLOW) }, { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_RYOS_MK_PRO) }, { } }; MODULE_DEVICE_TABLE(hid, ryos_devices); static struct hid_driver ryos_driver = { .name = "ryos", .id_table = ryos_devices, .probe = ryos_probe, .remove = ryos_remove, .raw_event = ryos_raw_event }; static int __init ryos_init(void) { int retval; retval = class_register(&ryos_class); if (retval) return retval; retval = hid_register_driver(&ryos_driver); if (retval) class_unregister(&ryos_class); return retval; } static void __exit ryos_exit(void) { hid_unregister_driver(&ryos_driver); class_unregister(&ryos_class); } module_init(ryos_init); module_exit(ryos_exit); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat Ryos MK/Glow/Pro driver"); MODULE_LICENSE("GPL v2"); |
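/*
 * Usage note (illustration, not part of the driver above): configuration is
 * done through the binary sysfs attributes declared above (profile, keys_*,
 * light, macro, ...), while special reports with report number 0x03 are
 * forwarded by ryos_raw_event() to the character device registered through
 * roccat_connect(). A hypothetical reader of that device might look like the
 * sketch below; the "/dev/roccat0" path is an assumption, the real node name
 * depends on udev and the minor returned by roccat_connect().
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned char ev[5];	/* struct ryos_report_special: number + data[4] */
	int fd = open("/dev/roccat0", O_RDONLY);	/* assumed node name */

	if (fd < 0) {
		perror("open");
		return 1;
	}

	while (read(fd, ev, sizeof(ev)) == sizeof(ev))
		printf("special report %02x: %02x %02x %02x %02x\n",
		       ev[0], ev[1], ev[2], ev[3], ev[4]);

	close(fd);
	return 0;
}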
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Persistent Storage - ramfs parts.
* * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com> */ #include <linux/module.h> #include <linux/fs.h> #include <linux/fsnotify.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> #include <linux/list.h> #include <linux/string.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/ramfs.h> #include <linux/parser.h> #include <linux/sched.h> #include <linux/magic.h> #include <linux/pstore.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/cleanup.h> #include "internal.h" #define PSTORE_NAMELEN 64 static DEFINE_MUTEX(records_list_lock); static LIST_HEAD(records_list); static DEFINE_MUTEX(pstore_sb_lock); static struct super_block *pstore_sb; DEFINE_FREE(pstore_iput, struct inode *, if (_T) iput(_T)) struct pstore_private { struct list_head list; struct dentry *dentry; struct pstore_record *record; size_t total_size; }; struct pstore_ftrace_seq_data { const void *ptr; size_t off; size_t size; }; #define REC_SIZE sizeof(struct pstore_ftrace_record) static void free_pstore_private(struct pstore_private *private) { if (!private) return; if (private->record) { kvfree(private->record->buf); kfree(private->record->priv); kfree(private->record); } kfree(private); } DEFINE_FREE(pstore_private, struct pstore_private *, free_pstore_private(_T)); static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) { struct pstore_private *ps = s->private; struct pstore_ftrace_seq_data *data __free(kfree) = NULL; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return NULL; data->off = ps->total_size % REC_SIZE; data->off += *pos * REC_SIZE; if (data->off + REC_SIZE > ps->total_size) return NULL; return_ptr(data); } static void pstore_ftrace_seq_stop(struct seq_file *s, void *v) { kfree(v); } static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct pstore_private *ps = s->private; struct pstore_ftrace_seq_data *data = v; (*pos)++; data->off += REC_SIZE; if (data->off + REC_SIZE > ps->total_size) return NULL; return data; } static int pstore_ftrace_seq_show(struct seq_file *s, void *v) { struct pstore_private *ps = s->private; struct pstore_ftrace_seq_data *data = v; struct pstore_ftrace_record *rec; if (!data) return 0; rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off); seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %ps <- %pS\n", pstore_ftrace_decode_cpu(rec), pstore_ftrace_read_timestamp(rec), rec->ip, rec->parent_ip, (void *)rec->ip, (void *)rec->parent_ip); return 0; } static const struct seq_operations pstore_ftrace_seq_ops = { .start = pstore_ftrace_seq_start, .next = pstore_ftrace_seq_next, .stop = pstore_ftrace_seq_stop, .show = pstore_ftrace_seq_show, }; static ssize_t pstore_file_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct seq_file *sf = file->private_data; struct pstore_private *ps = sf->private; if (ps->record->type == PSTORE_TYPE_FTRACE) return seq_read(file, userbuf, count, ppos); return simple_read_from_buffer(userbuf, count, ppos, ps->record->buf, ps->total_size); } static int pstore_file_open(struct inode *inode, struct file *file) { struct pstore_private *ps = inode->i_private; struct seq_file *sf; int err; const struct seq_operations *sops = NULL; if (ps->record->type == PSTORE_TYPE_FTRACE) sops = &pstore_ftrace_seq_ops; err = seq_open(file, sops); if (err < 0) return err; sf = file->private_data; sf->private = ps; return 0; } static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence) { 
struct seq_file *sf = file->private_data; if (sf->op) return seq_lseek(file, off, whence); return default_llseek(file, off, whence); } static const struct file_operations pstore_file_operations = { .open = pstore_file_open, .read = pstore_file_read, .llseek = pstore_file_llseek, .release = seq_release, }; /* * When a file is unlinked from our file system we call the * platform driver to erase the record from persistent store. */ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; struct pstore_record *record = p->record; if (!record->psi->erase) return -EPERM; /* Make sure we can't race while removing this file. */ scoped_guard(mutex, &records_list_lock) { if (!list_empty(&p->list)) list_del_init(&p->list); else return -ENOENT; p->dentry = NULL; } scoped_guard(mutex, &record->psi->read_mutex) record->psi->erase(record); return simple_unlink(dir, dentry); } static void pstore_evict_inode(struct inode *inode) { struct pstore_private *p = inode->i_private; clear_inode(inode); free_pstore_private(p); } static const struct inode_operations pstore_dir_inode_operations = { .lookup = simple_lookup, .unlink = pstore_unlink, }; static struct inode *pstore_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); } return inode; } enum { Opt_kmsg_bytes, Opt_err }; static const match_table_t tokens = { {Opt_kmsg_bytes, "kmsg_bytes=%u"}, {Opt_err, NULL} }; static void parse_options(char *options) { char *p; substring_t args[MAX_OPT_ARGS]; int option; if (!options) return; while ((p = strsep(&options, ",")) != NULL) { int token; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_kmsg_bytes: if (!match_int(&args[0], &option)) pstore_set_kmsg_bytes(option); break; } } } /* * Display the mount options in /proc/mounts. */ static int pstore_show_options(struct seq_file *m, struct dentry *root) { if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); return 0; } static int pstore_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); parse_options(data); return 0; } static const struct super_operations pstore_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .evict_inode = pstore_evict_inode, .remount_fs = pstore_remount, .show_options = pstore_show_options, }; static struct dentry *psinfo_lock_root(void) { struct dentry *root; guard(mutex)(&pstore_sb_lock); /* * Having no backend is fine -- no records appear. * Not being mounted is fine -- nothing to do. */ if (!psinfo || !pstore_sb) return NULL; root = pstore_sb->s_root; inode_lock(d_inode(root)); return root; } int pstore_put_backend_records(struct pstore_info *psi) { struct pstore_private *pos, *tmp; struct dentry *root; root = psinfo_lock_root(); if (!root) return 0; scoped_guard(mutex, &records_list_lock) { list_for_each_entry_safe(pos, tmp, &records_list, list) { if (pos->record->psi == psi) { list_del_init(&pos->list); d_invalidate(pos->dentry); simple_unlink(d_inode(root), pos->dentry); pos->dentry = NULL; } } } inode_unlock(d_inode(root)); return 0; } /* * Make a regular file in the root directory of our file system. * Load it up with "size" bytes of data from "buf". * Set the mtime & ctime to the date that this record was originally stored. 
*/ int pstore_mkfile(struct dentry *root, struct pstore_record *record) { struct dentry *dentry; struct inode *inode __free(pstore_iput) = NULL; char name[PSTORE_NAMELEN]; struct pstore_private *private __free(pstore_private) = NULL, *pos; size_t size = record->size + record->ecc_notice_size; if (WARN_ON(!inode_is_locked(d_inode(root)))) return -EINVAL; guard(mutex)(&records_list_lock); /* Skip records that are already present in the filesystem. */ list_for_each_entry(pos, &records_list, list) { if (pos->record->type == record->type && pos->record->id == record->id && pos->record->psi == record->psi) return -EEXIST; } inode = pstore_get_inode(root->d_sb); if (!inode) return -ENOMEM; inode->i_mode = S_IFREG | 0444; inode->i_fop = &pstore_file_operations; scnprintf(name, sizeof(name), "%s-%s-%llu%s", pstore_type_to_name(record->type), record->psi->name, record->id, record->compressed ? ".enc.z" : ""); private = kzalloc(sizeof(*private), GFP_KERNEL); if (!private) return -ENOMEM; dentry = d_alloc_name(root, name); if (!dentry) return -ENOMEM; private->dentry = dentry; private->record = record; inode->i_size = private->total_size = size; inode->i_private = private; if (record->time.tv_sec) inode_set_mtime_to_ts(inode, inode_set_ctime_to_ts(inode, record->time)); d_add(dentry, no_free_ptr(inode)); list_add(&(no_free_ptr(private))->list, &records_list); return 0; } /* * Read all the records from the persistent store. Create * files in our filesystem. Don't warn about -EEXIST errors * when we are re-scanning the backing store looking to add new * error records. */ void pstore_get_records(int quiet) { struct dentry *root; root = psinfo_lock_root(); if (!root) return; pstore_get_backend_records(psinfo, root, quiet); inode_unlock(d_inode(root)); } static int pstore_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = PSTOREFS_MAGIC; sb->s_op = &pstore_ops; sb->s_time_gran = 1; parse_options(data); inode = pstore_get_inode(sb); if (inode) { inode->i_mode = S_IFDIR | 0750; inode->i_op = &pstore_dir_inode_operations; inode->i_fop = &simple_dir_operations; inc_nlink(inode); } sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; scoped_guard(mutex, &pstore_sb_lock) pstore_sb = sb; pstore_get_records(0); return 0; } static struct dentry *pstore_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_single(fs_type, flags, data, pstore_fill_super); } static void pstore_kill_sb(struct super_block *sb) { guard(mutex)(&pstore_sb_lock); WARN_ON(pstore_sb && pstore_sb != sb); kill_litter_super(sb); pstore_sb = NULL; guard(mutex)(&records_list_lock); INIT_LIST_HEAD(&records_list); } static struct file_system_type pstore_fs_type = { .owner = THIS_MODULE, .name = "pstore", .mount = pstore_mount, .kill_sb = pstore_kill_sb, }; int __init pstore_init_fs(void) { int err; /* Create a convenient mount point for people to access pstore */ err = sysfs_create_mount_point(fs_kobj, "pstore"); if (err) goto out; err = register_filesystem(&pstore_fs_type); if (err < 0) sysfs_remove_mount_point(fs_kobj, "pstore"); out: return err; } void __exit pstore_exit_fs(void) { unregister_filesystem(&pstore_fs_type); sysfs_remove_mount_point(fs_kobj, "pstore"); } |
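/*
 * Usage note (illustration, not part of the file above): the filesystem is
 * normally mounted on the mount point created by pstore_init_fs(), and the
 * kmsg_bytes option is handled by parse_options() above. Record names follow
 * the "<type>-<backend>-<id>" pattern built in pstore_mkfile(); the backend
 * name and record ids below are just examples:
 *
 *	# mount -t pstore -o kmsg_bytes=16384 pstore /sys/fs/pstore
 *	# ls /sys/fs/pstore
 *	dmesg-ramoops-0  dmesg-ramoops-1
 *	# rm /sys/fs/pstore/dmesg-ramoops-0   # erases the record via pstore_unlink()
 */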
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGE_FRAG_CACHE_H #define _LINUX_PAGE_FRAG_CACHE_H #include <linux/bits.h> #include <linux/log2.h> #include <linux/mm_types_task.h> #include <linux/types.h> #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) /* Use a full byte here to enable assembler optimization as the shift * operation is usually expecting a byte. */ #define PAGE_FRAG_CACHE_ORDER_MASK GENMASK(7, 0) #else /* Compiler should be able to figure out we don't read things as any value * ANDed with 0 is 0. */ #define PAGE_FRAG_CACHE_ORDER_MASK 0 #endif #define PAGE_FRAG_CACHE_PFMEMALLOC_BIT (PAGE_FRAG_CACHE_ORDER_MASK + 1) static inline bool encoded_page_decode_pfmemalloc(unsigned long encoded_page) { return !!(encoded_page & PAGE_FRAG_CACHE_PFMEMALLOC_BIT); } static inline void page_frag_cache_init(struct page_frag_cache *nc) { nc->encoded_page = 0; } static inline bool page_frag_cache_is_pfmemalloc(struct page_frag_cache *nc) { return encoded_page_decode_pfmemalloc(nc->encoded_page); } void page_frag_cache_drain(struct page_frag_cache *nc); void __page_frag_cache_drain(struct page *page, unsigned int count); void *__page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align_mask); static inline void *page_frag_alloc_align(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask, unsigned int align) { WARN_ON_ONCE(!is_power_of_2(align)); return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align); } static inline void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { return __page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u); } void page_frag_free(void *addr); #endif
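The header above only exposes the decode side of the encoded_page word: for the PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE case it reserves the low byte (GENMASK(7, 0)) for the allocation order and the next bit for the pfmemalloc flag. The userspace sketch below packs and unpacks a word with that layout to make the masks concrete; encode() is an illustrative stand-in, since the real encode helper lives in the page_frag allocator itself and is not shown here.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define ORDER_MASK      0xffUL                  /* mirrors GENMASK(7, 0) */
#define PFMEMALLOC_BIT  (ORDER_MASK + 1)        /* bit 8 */

/* Illustrative encode: the page address is assumed page-aligned, so its
 * low bits are free to carry the order and the pfmemalloc flag.
 */
static unsigned long encode(unsigned long page_addr, unsigned int order,
                            bool pfmemalloc)
{
        return page_addr | (order & ORDER_MASK) |
               (pfmemalloc ? PFMEMALLOC_BIT : 0);
}

/* Mirrors encoded_page_decode_pfmemalloc() in the header above */
static bool decode_pfmemalloc(unsigned long encoded_page)
{
        return !!(encoded_page & PFMEMALLOC_BIT);
}

int main(void)
{
        unsigned long e = encode(0x7f0000000000UL, 3, true);

        assert(decode_pfmemalloc(e));
        assert((e & ORDER_MASK) == 3);
        printf("encoded=%#lx order=%lu pfmemalloc=%d\n",
               e, e & ORDER_MASK, decode_pfmemalloc(e));
        return 0;
}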
// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. 
contributors: * * Marek Lindner */ #include "gateway_client.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/udp.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "routing.h" #include "translation-table.h" /* These are the offsets of the "hw type" and "hw address length" in the dhcp * packet starting at the beginning of the dhcp header */ #define BATADV_DHCP_HTYPE_OFFSET 1 #define BATADV_DHCP_HLEN_OFFSET 2 /* Value of htype representing Ethernet */ #define BATADV_DHCP_HTYPE_ETHERNET 0x01 /* This is the offset of the "chaddr" field in the dhcp packet starting at the * beginning of the dhcp header */ #define BATADV_DHCP_CHADDR_OFFSET 28 /** * batadv_gw_node_release() - release gw_node from lists and queue for free * after rcu grace period * @ref: kref pointer of the gw_node */ void batadv_gw_node_release(struct kref *ref) { struct batadv_gw_node *gw_node; gw_node = container_of(ref, struct batadv_gw_node, refcount); batadv_orig_node_put(gw_node->orig_node); kfree_rcu(gw_node, rcu); } /** * batadv_gw_get_selected_gw_node() - Get currently selected gateway * @bat_priv: the bat priv with all the soft interface information * * Return: selected gateway (with increased refcnt), NULL on errors */ struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; rcu_read_lock(); gw_node = rcu_dereference(bat_priv->gw.curr_gw); if (!gw_node) goto out; if (!kref_get_unless_zero(&gw_node->refcount)) gw_node = NULL; out: rcu_read_unlock(); return gw_node; } /** * batadv_gw_get_selected_orig() - Get originator of currently selected gateway * @bat_priv: the bat priv with all the soft interface information * * Return: orig_node of selected gateway (with increased refcnt), NULL on errors */ struct batadv_orig_node * batadv_gw_get_selected_orig(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; struct batadv_orig_node *orig_node = NULL; gw_node = batadv_gw_get_selected_gw_node(bat_priv); if (!gw_node) goto out; rcu_read_lock(); orig_node = gw_node->orig_node; if (!orig_node) goto unlock; if (!kref_get_unless_zero(&orig_node->refcount)) orig_node = NULL; unlock: rcu_read_unlock(); out: batadv_gw_node_put(gw_node); return orig_node; } static void batadv_gw_select(struct batadv_priv *bat_priv, struct batadv_gw_node *new_gw_node) { struct batadv_gw_node *curr_gw_node; spin_lock_bh(&bat_priv->gw.list_lock); if (new_gw_node) kref_get(&new_gw_node->refcount); curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node, true); batadv_gw_node_put(curr_gw_node); spin_unlock_bh(&bat_priv->gw.list_lock); } /** * batadv_gw_reselect() - force a gateway reselection * @bat_priv: the bat priv with all the soft interface information * * Set a flag to remind the GW component to perform a new gateway reselection. 
* However this function does not ensure that the current gateway is going to be * deselected. The reselection mechanism may elect the same gateway once again. * * This means that invoking batadv_gw_reselect() does not guarantee a gateway * change and therefore a uevent is not necessarily expected. */ void batadv_gw_reselect(struct batadv_priv *bat_priv) { atomic_set(&bat_priv->gw.reselect, 1); } /** * batadv_gw_check_client_stop() - check if client mode has been switched off * @bat_priv: the bat priv with all the soft interface information * * This function assumes the caller has checked that the gw state *is actually * changing*. This function is not supposed to be called when there is no state * change. */ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw; if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) return; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!curr_gw) return; /* deselect the current gateway so that next time that client mode is * enabled a proper GW_ADD event can be sent */ batadv_gw_select(bat_priv, NULL); /* if batman-adv is switching the gw client mode off and a gateway was * already selected, send a DEL uevent */ batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); batadv_gw_node_put(curr_gw); } /** * batadv_gw_election() - Elect the best gateway * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_election(struct batadv_priv *bat_priv) { struct batadv_gw_node *curr_gw = NULL; struct batadv_gw_node *next_gw = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; char gw_addr[18] = { '\0' }; if (atomic_read(&bat_priv->gw.mode) != BATADV_GW_MODE_CLIENT) goto out; if (!bat_priv->algo_ops->gw.get_best_gw_node) goto out; curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!batadv_atomic_dec_not_zero(&bat_priv->gw.reselect) && curr_gw) goto out; /* if gw.reselect is set to 1 it means that a previous call to * gw.is_eligible() said that we have a new best GW, therefore it can * now be picked from the list and selected */ next_gw = bat_priv->algo_ops->gw.get_best_gw_node(bat_priv); if (curr_gw == next_gw) goto out; if (next_gw) { sprintf(gw_addr, "%pM", next_gw->orig_node->orig); router = batadv_orig_router_get(next_gw->orig_node, BATADV_IF_DEFAULT); if (!router) { batadv_gw_reselect(bat_priv); goto out; } router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT); if (!router_ifinfo) { batadv_gw_reselect(bat_priv); goto out; } } if (curr_gw && !next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Removing selected gateway - no gateway in range\n"); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); } else if (!curr_gw && next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n", next_gw->orig_node->orig, next_gw->bandwidth_down / 10, next_gw->bandwidth_down % 10, next_gw->bandwidth_up / 10, next_gw->bandwidth_up % 10, router_ifinfo->bat_iv.tq_avg); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_ADD, gw_addr); } else { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Changing route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n", next_gw->orig_node->orig, next_gw->bandwidth_down / 10, next_gw->bandwidth_down % 10, next_gw->bandwidth_up / 10, next_gw->bandwidth_up % 10, router_ifinfo->bat_iv.tq_avg); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_CHANGE, gw_addr); } batadv_gw_select(bat_priv, next_gw); out: 
batadv_gw_node_put(curr_gw); batadv_gw_node_put(next_gw); batadv_neigh_node_put(router); batadv_neigh_ifinfo_put(router_ifinfo); } /** * batadv_gw_check_election() - Elect orig node as best gateway when eligible * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is to be checked */ void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_node *curr_gw_orig; /* abort immediately if the routing algorithm does not support gateway * election */ if (!bat_priv->algo_ops->gw.is_eligible) return; curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); if (!curr_gw_orig) goto reselect; /* this node already is the gateway */ if (curr_gw_orig == orig_node) goto out; if (!bat_priv->algo_ops->gw.is_eligible(bat_priv, curr_gw_orig, orig_node)) goto out; reselect: batadv_gw_reselect(bat_priv); out: batadv_orig_node_put(curr_gw_orig); } /** * batadv_gw_node_add() - add gateway node to list of available gateways * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information * * Has to be called with the appropriate locks being acquired * (gw.list_lock). */ static void batadv_gw_node_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_gateway_data *gateway) { struct batadv_gw_node *gw_node; lockdep_assert_held(&bat_priv->gw.list_lock); if (gateway->bandwidth_down == 0) return; gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); if (!gw_node) return; kref_init(&gw_node->refcount); INIT_HLIST_NODE(&gw_node->list); kref_get(&orig_node->refcount); gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); kref_get(&gw_node->refcount); hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list); bat_priv->gw.generation++; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n", orig_node->orig, ntohl(gateway->bandwidth_down) / 10, ntohl(gateway->bandwidth_down) % 10, ntohl(gateway->bandwidth_up) / 10, ntohl(gateway->bandwidth_up) % 10); /* don't return reference to new gw_node */ batadv_gw_node_put(gw_node); } /** * batadv_gw_node_get() - retrieve gateway node from list of available gateways * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * * Return: gateway node if found or NULL otherwise. 
*/ struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_gw_node *gw_node_tmp, *gw_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(gw_node_tmp, &bat_priv->gw.gateway_list, list) { if (gw_node_tmp->orig_node != orig_node) continue; if (!kref_get_unless_zero(&gw_node_tmp->refcount)) continue; gw_node = gw_node_tmp; break; } rcu_read_unlock(); return gw_node; } /** * batadv_gw_node_update() - update list of available gateways with changed * bandwidth information * @bat_priv: the bat priv with all the soft interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information */ void batadv_gw_node_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_gateway_data *gateway) { struct batadv_gw_node *gw_node, *curr_gw = NULL; spin_lock_bh(&bat_priv->gw.list_lock); gw_node = batadv_gw_node_get(bat_priv, orig_node); if (!gw_node) { batadv_gw_node_add(bat_priv, orig_node, gateway); spin_unlock_bh(&bat_priv->gw.list_lock); goto out; } spin_unlock_bh(&bat_priv->gw.list_lock); if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) && gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Gateway bandwidth of originator %pM changed from %u.%u/%u.%u MBit to %u.%u/%u.%u MBit\n", orig_node->orig, gw_node->bandwidth_down / 10, gw_node->bandwidth_down % 10, gw_node->bandwidth_up / 10, gw_node->bandwidth_up % 10, ntohl(gateway->bandwidth_down) / 10, ntohl(gateway->bandwidth_down) % 10, ntohl(gateway->bandwidth_up) / 10, ntohl(gateway->bandwidth_up) % 10); gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); if (ntohl(gateway->bandwidth_down) == 0) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Gateway %pM removed from gateway list\n", orig_node->orig); /* Note: We don't need a NULL check here, since curr_gw never * gets dereferenced. 
*/ spin_lock_bh(&bat_priv->gw.list_lock); if (!hlist_unhashed(&gw_node->list)) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); batadv_gw_node_put(curr_gw); } out: batadv_gw_node_put(gw_node); } /** * batadv_gw_node_delete() - Remove orig_node from gateway list * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node which is currently in process of being removed */ void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_tvlv_gateway_data gateway; gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; batadv_gw_node_update(bat_priv, orig_node, &gateway); } /** * batadv_gw_node_free() - Free gateway information from soft interface * @bat_priv: the bat priv with all the soft interface information */ void batadv_gw_node_free(struct batadv_priv *bat_priv) { struct batadv_gw_node *gw_node; struct hlist_node *node_tmp; spin_lock_bh(&bat_priv->gw.list_lock); hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.gateway_list, list) { hlist_del_init_rcu(&gw_node->list); batadv_gw_node_put(gw_node); bat_priv->gw.generation++; } spin_unlock_bh(&bat_priv->gw.list_lock); } /** * batadv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options * * Return: Error code, or length of message */ int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; struct net_device *soft_iface; struct batadv_priv *bat_priv; int ret; soft_iface = batadv_netlink_get_softif(cb); if (IS_ERR(soft_iface)) return PTR_ERR(soft_iface); bat_priv = netdev_priv(soft_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } if (!bat_priv->algo_ops->gw.dump) { ret = -EOPNOTSUPP; goto out; } bat_priv->algo_ops->gw.dump(msg, cb, bat_priv); ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(soft_iface); return ret; } /** * batadv_gw_dhcp_recipient_get() - check if a packet is a DHCP message * @skb: the packet to check * @header_len: a pointer to the batman-adv header size * @chaddr: buffer where the client address will be stored. Valid * only if the function returns BATADV_DHCP_TO_CLIENT * * This function may re-allocate the data buffer of the skb passed as argument. 
* * Return: * - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error * while parsing it * - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server * - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client */ enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, u8 *chaddr) { enum batadv_dhcp_recipient ret = BATADV_DHCP_NO; struct ethhdr *ethhdr; struct iphdr *iphdr; struct ipv6hdr *ipv6hdr; struct udphdr *udphdr; struct vlan_ethhdr *vhdr; int chaddr_offset; __be16 proto; u8 *p; /* check for ethernet header */ if (!pskb_may_pull(skb, *header_len + ETH_HLEN)) return BATADV_DHCP_NO; ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; *header_len += ETH_HLEN; /* check for initial vlan header */ if (proto == htons(ETH_P_8021Q)) { if (!pskb_may_pull(skb, *header_len + VLAN_HLEN)) return BATADV_DHCP_NO; vhdr = vlan_eth_hdr(skb); proto = vhdr->h_vlan_encapsulated_proto; *header_len += VLAN_HLEN; } /* check for ip header */ switch (proto) { case htons(ETH_P_IP): if (!pskb_may_pull(skb, *header_len + sizeof(*iphdr))) return BATADV_DHCP_NO; iphdr = (struct iphdr *)(skb->data + *header_len); *header_len += iphdr->ihl * 4; /* check for udp header */ if (iphdr->protocol != IPPROTO_UDP) return BATADV_DHCP_NO; break; case htons(ETH_P_IPV6): if (!pskb_may_pull(skb, *header_len + sizeof(*ipv6hdr))) return BATADV_DHCP_NO; ipv6hdr = (struct ipv6hdr *)(skb->data + *header_len); *header_len += sizeof(*ipv6hdr); /* check for udp header */ if (ipv6hdr->nexthdr != IPPROTO_UDP) return BATADV_DHCP_NO; break; default: return BATADV_DHCP_NO; } if (!pskb_may_pull(skb, *header_len + sizeof(*udphdr))) return BATADV_DHCP_NO; udphdr = (struct udphdr *)(skb->data + *header_len); *header_len += sizeof(*udphdr); /* check for bootp port */ switch (proto) { case htons(ETH_P_IP): if (udphdr->dest == htons(67)) ret = BATADV_DHCP_TO_SERVER; else if (udphdr->source == htons(67)) ret = BATADV_DHCP_TO_CLIENT; break; case htons(ETH_P_IPV6): if (udphdr->dest == htons(547)) ret = BATADV_DHCP_TO_SERVER; else if (udphdr->source == htons(547)) ret = BATADV_DHCP_TO_CLIENT; break; } chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET; /* store the client address if the message is going to a client */ if (ret == BATADV_DHCP_TO_CLIENT) { if (!pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) return BATADV_DHCP_NO; /* check if the DHCP packet carries an Ethernet DHCP */ p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET; if (*p != BATADV_DHCP_HTYPE_ETHERNET) return BATADV_DHCP_NO; /* check if the DHCP packet carries a valid Ethernet address */ p = skb->data + *header_len + BATADV_DHCP_HLEN_OFFSET; if (*p != ETH_ALEN) return BATADV_DHCP_NO; ether_addr_copy(chaddr, skb->data + chaddr_offset); } return ret; } /** * batadv_gw_out_of_range() - check if the dhcp request destination is the best * gateway * @bat_priv: the bat priv with all the soft interface information * @skb: the outgoing packet * * Check if the skb is a DHCP request and if it is sent to the current best GW * server. Due to topology changes it may be the case that the GW server * previously selected is not the best one anymore. * * This call might reallocate skb data. * Must be invoked only when the DHCP packet is going TO a DHCP SERVER. * * Return: true if the packet destination is unicast and it is not the best gw, * false otherwise. 
*/ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_neigh_node *neigh_curr = NULL; struct batadv_neigh_node *neigh_old = NULL; struct batadv_orig_node *orig_dst_node = NULL; struct batadv_gw_node *gw_node = NULL; struct batadv_gw_node *curr_gw = NULL; struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo; struct ethhdr *ethhdr = (struct ethhdr *)skb->data; bool out_of_range = false; u8 curr_tq_avg; unsigned short vid; vid = batadv_get_vid(skb, 0); if (is_multicast_ether_addr(ethhdr->h_dest)) goto out; orig_dst_node = batadv_transtable_search(bat_priv, ethhdr->h_source, ethhdr->h_dest, vid); if (!orig_dst_node) goto out; gw_node = batadv_gw_node_get(bat_priv, orig_dst_node); if (!gw_node) goto out; switch (atomic_read(&bat_priv->gw.mode)) { case BATADV_GW_MODE_SERVER: /* If we are a GW then we are our best GW. We can artificially * set the tq towards ourself as the maximum value */ curr_tq_avg = BATADV_TQ_MAX_VALUE; break; case BATADV_GW_MODE_CLIENT: curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (!curr_gw) goto out; /* packet is going to our gateway */ if (curr_gw->orig_node == orig_dst_node) goto out; /* If the dhcp packet has been sent to a different gw, * we have to evaluate whether the old gw is still * reliable enough */ neigh_curr = batadv_find_router(bat_priv, curr_gw->orig_node, NULL); if (!neigh_curr) goto out; curr_ifinfo = batadv_neigh_ifinfo_get(neigh_curr, BATADV_IF_DEFAULT); if (!curr_ifinfo) goto out; curr_tq_avg = curr_ifinfo->bat_iv.tq_avg; batadv_neigh_ifinfo_put(curr_ifinfo); break; case BATADV_GW_MODE_OFF: default: goto out; } neigh_old = batadv_find_router(bat_priv, orig_dst_node, NULL); if (!neigh_old) goto out; old_ifinfo = batadv_neigh_ifinfo_get(neigh_old, BATADV_IF_DEFAULT); if (!old_ifinfo) goto out; if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD) out_of_range = true; batadv_neigh_ifinfo_put(old_ifinfo); out: batadv_orig_node_put(orig_dst_node); batadv_gw_node_put(curr_gw); batadv_gw_node_put(gw_node); batadv_neigh_node_put(neigh_old); batadv_neigh_node_put(neigh_curr); return out_of_range; } |
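batadv_gw_dhcp_recipient_get() ends by checking three fixed offsets inside the DHCP header (htype at 1, hlen at 2, chaddr at 28) before copying the client hardware address. The standalone sketch below applies the same checks to a raw buffer; dhcp_copy_chaddr() and the packet built in main() are illustrative, and the plain length check stands in for the kernel's pskb_may_pull().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DHCP_HTYPE_OFFSET       1       /* mirrors BATADV_DHCP_HTYPE_OFFSET */
#define DHCP_HLEN_OFFSET        2       /* mirrors BATADV_DHCP_HLEN_OFFSET */
#define DHCP_CHADDR_OFFSET      28      /* mirrors BATADV_DHCP_CHADDR_OFFSET */
#define DHCP_HTYPE_ETHERNET     0x01
#define MAC_LEN                 6

/* Validate htype/hlen and copy the client hardware address from a buffer
 * that starts at the DHCP header.
 */
static bool dhcp_copy_chaddr(const uint8_t *dhcp, size_t len, uint8_t *chaddr)
{
        if (len < DHCP_CHADDR_OFFSET + MAC_LEN)
                return false;   /* too short, like a failed pskb_may_pull() */
        if (dhcp[DHCP_HTYPE_OFFSET] != DHCP_HTYPE_ETHERNET)
                return false;   /* not an Ethernet DHCP message */
        if (dhcp[DHCP_HLEN_OFFSET] != MAC_LEN)
                return false;   /* hardware address is not 6 bytes */
        memcpy(chaddr, dhcp + DHCP_CHADDR_OFFSET, MAC_LEN);
        return true;
}

int main(void)
{
        uint8_t pkt[64] = { 0x02, DHCP_HTYPE_ETHERNET, MAC_LEN };
        uint8_t mac[MAC_LEN];

        memcpy(pkt + DHCP_CHADDR_OFFSET,
               (uint8_t[]){ 0x02, 0x11, 0x22, 0x33, 0x44, 0x55 }, MAC_LEN);
        if (dhcp_copy_chaddr(pkt, sizeof(pkt), mac))
                printf("chaddr %02x:%02x:%02x:%02x:%02x:%02x\n",
                       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
        return 0;
}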
// SPDX-License-Identifier: GPL-2.0 /* * Functions related to io context handling */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/sched/task.h> #include "blk.h" #include "blk-mq-sched.h" /* * For io context allocations */ static struct kmem_cache *iocontext_cachep; #ifdef CONFIG_BLK_ICQ /** * get_io_context - increment reference count to io_context * @ioc: io_context to get * * Increment reference count to @ioc. */ static void get_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); } /* * Exit an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_exit_icq(struct io_cq *icq) { struct elevator_type *et = icq->q->elevator->type; if (icq->flags & ICQ_EXITED) return; if (et->ops.exit_icq) et->ops.exit_icq(icq); icq->flags |= ICQ_EXITED; } static void ioc_exit_icqs(struct io_context *ioc) { struct io_cq *icq; spin_lock_irq(&ioc->lock); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) ioc_exit_icq(icq); spin_unlock_irq(&ioc->lock); } /* * Release an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_destroy_icq(struct io_cq *icq) { struct io_context *ioc = icq->ioc; struct request_queue *q = icq->q; struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); lockdep_assert_held(&q->queue_lock); if (icq->flags & ICQ_DESTROYED) return; radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); list_del_init(&icq->q_node); /* * Both setting lookup hint to and clearing it from @icq are done * under queue_lock. 
If it's not pointing to @icq now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); ioc_exit_icq(icq); /* * @icq->q might have gone away by the time RCU callback runs * making it impossible to determine icq_cache. Record it in @icq. */ icq->__rcu_icq_cache = et->icq_cache; icq->flags |= ICQ_DESTROYED; kfree_rcu(icq, __rcu_head); } /* * Slow path for ioc release in put_io_context(). Performs double-lock * dancing to unlink all icq's and then frees ioc. */ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); spin_lock_irq(&ioc->lock); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, struct io_cq, ioc_node); struct request_queue *q = icq->q; if (spin_trylock(&q->queue_lock)) { ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); } else { /* Make sure q and icq cannot be freed. */ rcu_read_lock(); /* Re-acquire the locks in the correct order. */ spin_unlock(&ioc->lock); spin_lock(&q->queue_lock); spin_lock(&ioc->lock); ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); rcu_read_unlock(); } } spin_unlock_irq(&ioc->lock); kmem_cache_free(iocontext_cachep, ioc); } /* * Releasing icqs requires reverse order double locking and we may already be * holding a queue_lock. Do it asynchronously from a workqueue. */ static bool ioc_delay_free(struct io_context *ioc) { unsigned long flags; spin_lock_irqsave(&ioc->lock, flags); if (!hlist_empty(&ioc->icq_list)) { queue_work(system_power_efficient_wq, &ioc->release_work); spin_unlock_irqrestore(&ioc->lock, flags); return true; } spin_unlock_irqrestore(&ioc->lock, flags); return false; } /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared * * Walk @q->icq_list and exit all io_cq's. */ void ioc_clear_queue(struct request_queue *q) { spin_lock_irq(&q->queue_lock); while (!list_empty(&q->icq_list)) { struct io_cq *icq = list_first_entry(&q->icq_list, struct io_cq, q_node); /* * Other context won't hold ioc lock to wait for queue_lock, see * details in ioc_release_fn(). */ spin_lock(&icq->ioc->lock); ioc_destroy_icq(icq); spin_unlock(&icq->ioc->lock); } spin_unlock_irq(&q->queue_lock); } #else /* CONFIG_BLK_ICQ */ static inline void ioc_exit_icqs(struct io_context *ioc) { } static inline bool ioc_delay_free(struct io_context *ioc) { return false; } #endif /* CONFIG_BLK_ICQ */ /** * put_io_context - put a reference of io_context * @ioc: io_context to put * * Decrement reference count of @ioc and release it if the count reaches * zero. 
*/ void put_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc)) kmem_cache_free(iocontext_cachep, ioc); } EXPORT_SYMBOL_GPL(put_io_context); /* Called by the exiting task */ void exit_io_context(struct task_struct *task) { struct io_context *ioc; task_lock(task); ioc = task->io_context; task->io_context = NULL; task_unlock(task); if (atomic_dec_and_test(&ioc->active_ref)) { ioc_exit_icqs(ioc); put_io_context(ioc); } } static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) return NULL; atomic_long_set(&ioc->refcount, 1); atomic_set(&ioc->active_ref, 1); #ifdef CONFIG_BLK_ICQ spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); #endif ioc->ioprio = IOPRIO_DEFAULT; return ioc; } int set_task_ioprio(struct task_struct *task, int ioprio) { int err; const struct cred *cred = current_cred(), *tcred; rcu_read_lock(); tcred = __task_cred(task); if (!uid_eq(tcred->uid, cred->euid) && !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { rcu_read_unlock(); return -EPERM; } rcu_read_unlock(); err = security_task_setioprio(task, ioprio); if (err) return err; task_lock(task); if (unlikely(!task->io_context)) { struct io_context *ioc; task_unlock(task); ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE); if (!ioc) return -ENOMEM; task_lock(task); if (task->flags & PF_EXITING) { kmem_cache_free(iocontext_cachep, ioc); goto out; } if (task->io_context) kmem_cache_free(iocontext_cachep, ioc); else task->io_context = ioc; } task->io_context->ioprio = ioprio; out: task_unlock(task); return 0; } EXPORT_SYMBOL_GPL(set_task_ioprio); int __copy_io(unsigned long clone_flags, struct task_struct *tsk) { struct io_context *ioc = current->io_context; /* * Share io context with parent, if CLONE_IO is set */ if (clone_flags & CLONE_IO) { atomic_inc(&ioc->active_ref); tsk->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) { tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); if (!tsk->io_context) return -ENOMEM; tsk->io_context->ioprio = ioc->ioprio; } return 0; } #ifdef CONFIG_BLK_ICQ /** * ioc_lookup_icq - lookup io_cq from ioc * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * with @q->queue_lock held. */ struct io_cq *ioc_lookup_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct io_cq *icq; lockdep_assert_held(&q->queue_lock); /* * icq's are indexed from @ioc using radix tree and hint pointer, * both of which are protected with RCU. All removals are done * holding both q and ioc locks, and we're holding q lock - if we * find a icq which points to us, it's guaranteed to be valid. */ rcu_read_lock(); icq = rcu_dereference(ioc->icq_hint); if (icq && icq->q == q) goto out; icq = radix_tree_lookup(&ioc->icq_tree, q->id); if (icq && icq->q == q) rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */ else icq = NULL; out: rcu_read_unlock(); return icq; } EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq * @q: request_queue of interest * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. 
* * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. */ static struct io_cq *ioc_create_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO, q->node); if (!icq) return NULL; if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } icq->ioc = ioc; icq->q = q; INIT_LIST_HEAD(&icq->q_node); INIT_HLIST_NODE(&icq->ioc_node); /* lock both q and ioc and try to link @icq */ spin_lock_irq(&q->queue_lock); spin_lock(&ioc->lock); if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); if (et->ops.init_icq) et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } spin_unlock(&ioc->lock); spin_unlock_irq(&q->queue_lock); radix_tree_preload_end(); return icq; } struct io_cq *ioc_find_get_icq(struct request_queue *q) { struct io_context *ioc = current->io_context; struct io_cq *icq = NULL; if (unlikely(!ioc)) { ioc = alloc_io_context(GFP_ATOMIC, q->node); if (!ioc) return NULL; task_lock(current); if (current->io_context) { kmem_cache_free(iocontext_cachep, ioc); ioc = current->io_context; } else { current->io_context = ioc; } get_io_context(ioc); task_unlock(current); } else { get_io_context(ioc); spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(q); spin_unlock_irq(&q->queue_lock); } if (!icq) { icq = ioc_create_icq(q); if (!icq) { put_io_context(ioc); return NULL; } } return icq; } EXPORT_SYMBOL_GPL(ioc_find_get_icq); #endif /* CONFIG_BLK_ICQ */ static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL); return 0; } subsys_initcall(blk_ioc_init); |
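ioc_release_fn() above needs both ioc->lock and q->queue_lock, but is entered with only ioc->lock held, while the canonical order (see ioc_create_icq() and ioc_clear_queue()) takes queue_lock first. It therefore tries a trylock and, on contention, drops ioc->lock and retakes both in the correct order. The pthread sketch below shows only that ordering dance, with illustrative names; mutexes stand in for the kernel spinlocks, and the kernel additionally holds an RCU read lock across the window so the queue and icq cannot go away.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ioc_lock = PTHREAD_MUTEX_INITIALIZER;

/* Canonical order is queue_lock before ioc_lock, but this path starts out
 * holding ioc_lock, so try the inner lock first and back off on failure.
 */
static void release_one_icq(void)
{
        pthread_mutex_lock(&ioc_lock);          /* caller's context: ioc locked */

        if (pthread_mutex_trylock(&queue_lock) != 0) {
                /* Contended: drop and retake in queue_lock -> ioc_lock order */
                pthread_mutex_unlock(&ioc_lock);
                pthread_mutex_lock(&queue_lock);
                pthread_mutex_lock(&ioc_lock);
        }

        /* ...destroy the icq while both locks are held... */

        pthread_mutex_unlock(&queue_lock);
        pthread_mutex_unlock(&ioc_lock);
}

int main(void)
{
        release_one_icq();
        puts("done");
        return 0;
}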
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/errno.h> #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/pgalloc_tag.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> #include <linux/kasan.h> #include <linux/memremap.h> #include <linux/slab.h> #include <linux/cacheinfo.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; struct user_struct; struct pt_regs; struct folio_batch; extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); #ifndef CONFIG_NUMA /* 
Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; static inline void set_max_mapnr(unsigned long limit) { max_mapnr = limit; } #else static inline void set_max_mapnr(unsigned long limit) { } #endif extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { return (unsigned long)atomic_long_read(&_totalram_pages); } static inline void totalram_pages_inc(void) { atomic_long_inc(&_totalram_pages); } static inline void totalram_pages_dec(void) { atomic_long_dec(&_totalram_pages); } static inline void totalram_pages_add(long count) { atomic_long_add(count, &_totalram_pages); } extern void * high_memory; extern int page_cluster; extern const int page_cluster_max; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else #define sysctl_legacy_va_layout 0 #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; extern int mmap_rnd_bits_max __ro_after_init; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS extern const int mmap_rnd_compat_bits_min; extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif #ifndef DIRECT_MAP_PHYSMEM_END # ifdef MAX_PHYSMEM_BITS # define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else # define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif #include <asm/page.h> #include <asm/processor.h> #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif #ifndef page_to_virt #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) #endif #ifndef lm_alias #define lm_alias(x) __va(__pa_symbol(x)) #endif /* * To prevent common memory management code establishing * a zero page mapping on a read fault. * This macro should be defined within <asm/pgtable.h>. * s390 does this to prevent multiplexing of hardware bits * related to the physical page in case of virtualization. */ #ifndef mm_forbids_zeropage #define mm_forbids_zeropage(X) (0) #endif /* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement their own version of * mm_zero_struct_page they should wrap the defines below in a #ifndef and * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 96 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statements if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. */ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); BUILD_BUG_ON(sizeof(struct page) > 96); switch (sizeof(struct page)) { case 96: _pp[11] = 0; fallthrough; case 88: _pp[10] = 0; fallthrough; case 80: _pp[9] = 0; fallthrough; case 72: _pp[8] = 0; fallthrough; case 64: _pp[7] = 0; fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0; _pp[2] = 0; _pp[1] = 0; _pp[0] = 0; } } #else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. 
Users can overwrite this number by sysctl but there is a * problem. * * When a program's coredump is generated as ELF format, a section is created * per a vma. In ELF, the number of sections is represented in unsigned short. * This means the number of sections should be smaller than 65535 at coredump. * Because the kernel adds some informative sections to a image of program at * generating coredump, we need some margin. The number of extra sections is * 1-3 now and depends on arch. We use "5" as safe margin, here. * * ELF extended numbering allows more than 65535 sections, so 16-bit bound is * not a hard limit any more. Although some userspace tools can be surprised by * that. */ #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; int overcommit_ratio_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_kbytes_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) #define folio_page_idx(folio, p) (page_to_pfn(p) - folio_pfn(folio)) #else #define nth_page(page,n) ((page) + (n)) #define folio_page_idx(folio, p) ((p) - &(folio)->page) #endif /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* to align the pointer to the (prev) page boundary */ #define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE) /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) static inline struct folio *lru_to_folio(struct list_head *head) { return list_entry((head)->prev, struct folio, lru); } void setup_initial_init_mm(void *start_code, void *end_code, void *end_data, void *brk); /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions). */ struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); /* Use only if VMA has no other users */ void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 #define VM_SHARED 0x00000008 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. 
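 *
 * Purely as an illustration of that invariant (a hypothetical compile-time
 * check, not something this header defines):
 *
 *	BUILD_BUG_ON(VM_MAYREAD  >> 4 != VM_READ);
 *	BUILD_BUG_ON(VM_MAYWRITE >> 4 != VM_WRITE);
 *	BUILD_BUG_ON(VM_MAYEXEC  >> 4 != VM_EXEC);
 *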
*/ #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ #define VM_MAYWRITE 0x00000020 #define VM_MAYEXEC 0x00000040 #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #ifdef CONFIG_MMU #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #else /* CONFIG_MMU */ #define VM_MAYOVERLAY 0x00000200 /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ #define VM_UFFD_MISSING 0 #endif /* CONFIG_MMU */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ /* Used by sys_madvise() */ #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ #else # define VM_SOFTDIRTY 0 #endif #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) #define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 # define VM_PKEY_BIT1 VM_HIGH_ARCH_1 # define VM_PKEY_BIT2 VM_HIGH_ARCH_2 #if CONFIG_ARCH_PKEY_BITS > 3 # define VM_PKEY_BIT3 VM_HIGH_ARCH_3 #else # define VM_PKEY_BIT3 0 #endif #if CONFIG_ARCH_PKEY_BITS > 4 # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_X86_USER_SHADOW_STACK /* * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of * support core mm. * * These VMAs will get a single end guard page. 
This helps userspace protect * itself from attacks. A single page is enough for current shadow stack archs * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c * for more details on the guard size. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_5 #endif #if defined(CONFIG_ARM64_GCS) /* * arm64's Guarded Control Stack implements similar functionality and * has similar constraints to shadow stacks. */ # define VM_SHADOW_STACK VM_HIGH_ARCH_6 #endif #ifndef VM_SHADOW_STACK # define VM_SHADOW_STACK VM_NONE #endif #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC64) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI #elif defined(CONFIG_ARM64) # define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ # define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif #if defined(CONFIG_ARM64_MTE) # define VM_MTE VM_HIGH_ARCH_4 /* Use Tagged memory for access control */ # define VM_MTE_ALLOWED VM_HIGH_ARCH_5 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE #endif #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif #ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR # define VM_UFFD_MINOR_BIT 38 # define VM_UFFD_MINOR BIT(VM_UFFD_MINOR_BIT) /* UFFD minor faults */ #else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ /* * This flag is used to connect VFIO to arch specific KVM code. It * indicates that the memory under this VMA is safe for use with any * non-cachable memory type inside KVM. Some VFIO devices, on some * platforms, are thought to be unsafe and can cause machine crashes * if KVM does not lock down the memory type. */ #ifdef CONFIG_64BIT #define VM_ALLOW_ANY_UNCACHED_BIT 39 #define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT) #else #define VM_ALLOW_ANY_UNCACHED VM_NONE #endif #ifdef CONFIG_64BIT #define VM_DROPPABLE_BIT 40 #define VM_DROPPABLE BIT(VM_DROPPABLE_BIT) #elif defined(CONFIG_PPC32) #define VM_DROPPABLE VM_ARCH_1 #else #define VM_DROPPABLE VM_NONE #endif #ifdef CONFIG_64BIT /* VM is sealed, in vm_flags */ #define VM_SEALED _BITUL(63) #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? 
VM_EXEC : 0) /* Common data flag combinations */ #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC #endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif #define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) #ifdef CONFIG_STACK_GROWSUP #define VM_STACK VM_GROWSUP #define VM_STACK_EARLY VM_GROWSDOWN #else #define VM_STACK VM_GROWSDOWN #define VM_STACK_EARLY 0 #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) /* * Special vmas that are non-mergable, non-mlock()able. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE /* This mask represents all the VMA flag bits used by mlock */ #define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. */ #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * @flags: Fault flags. * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); } #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. 
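 *
 * As a minimal, hypothetical sketch of a driver ->fault handler consuming
 * this structure (names such as my_dev are illustrative only and are not
 * defined anywhere in this header):
 *
 *	static vm_fault_t my_dev_fault(struct vm_fault *vmf)
 *	{
 *		struct my_dev *dev = vmf->vma->vm_private_data;
 *
 *		if (vmf->pgoff >= dev->nr_pages)
 *			return VM_FAULT_SIGBUS;
 *		vmf->page = dev->pages[vmf->pgoff];
 *		get_page(vmf->page);
 *		return 0;
 *	}
 *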
* * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { const struct { struct vm_area_struct *vma; /* Target VMA */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address - masked */ unsigned long real_address; /* Faulting virtual address - unmasked */ }; enum fault_flag flags; /* FAULT_FLAG_xxx flags * XXX: should really be 'const' */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ union { pte_t orig_pte; /* Value of PTE at the time of fault */ pmd_t orig_pmd; /* Value of PMD at the time of fault, * used by PMD fault only. */ }; struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. */ spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() sets up a page * table from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); /** * @close: Called when the VMA is being removed from the MM. * Context: User context. May sleep. Caller holds mmap_lock. */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); int (*mremap)(struct vm_area_struct *area); /* * Called by mprotect() to make driver-specific permission * checks before mprotect() is finalised. The VMA must not * be modified. Returns 0 if mprotect() can proceed. */ int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs. See also generic_access_phys() for a generic * implementation useful for any iomem mapping. */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); /* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this * vma to be dumped unconditionally. 
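 *
 * A hypothetical implementation (illustrative only) could simply return a
 * fixed label so the mapping shows up under that name in /proc/PID/maps:
 *
 *	static const char *my_dev_vma_name(struct vm_area_struct *vma)
 *	{
 *		return "[my_dev]";
 *	}
 *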
*/ const char *(*name)(struct vm_area_struct *vma); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy. */ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); /* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page. */ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); }; #ifdef CONFIG_NUMA_BALANCING static inline void vma_numab_state_init(struct vm_area_struct *vma) { vma->numab_state = NULL; } static inline void vma_numab_state_free(struct vm_area_struct *vma) { kfree(vma->numab_state); } #else static inline void vma_numab_state_init(struct vm_area_struct *vma) {} static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. */ static inline bool vma_start_read(struct vm_area_struct *vma) { /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need * ACQUIRE semantics, because this is just a lockless check whose result * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) return false; /* * Overflow might produce false locked result. * False unlocked result is impossible because we modify and check * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are * racing with vma_end_write_all(), we only start reading from the VMA * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } return true; } static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ up_read(&vma->vm_lock->lock); rcu_read_unlock(); } /* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); /* * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently * write-locked mmap_lock is dropped or downgraded. */ static inline void vma_start_write(struct vm_area_struct *vma) { unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; down_write(&vma->vm_lock->lock); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). * We don't really care about the correctness of that early check, but * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); up_write(&vma->vm_lock->lock); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) { unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } static inline void vma_assert_locked(struct vm_area_struct *vma) { if (!rwsem_is_locked(&vma->vm_lock->lock)) vma_assert_write_locked(vma); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) { /* When detaching vma should be write-locked */ if (detached) vma_assert_write_locked(vma); vma->detached = detached; } static inline void release_fault_lock(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_end_read(vmf->vma); else mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { if (vmf->flags & FAULT_FLAG_VMA_LOCK) vma_assert_locked(vmf->vma); else mmap_assert_locked(vmf->vma->vm_mm); } struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); #else /* CONFIG_PER_VMA_LOCK */ static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address) { return NULL; } static inline void vma_assert_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); } static inline void release_fault_lock(struct vm_fault *vmf) { mmap_read_unlock(vmf->vma->vm_mm); } static inline void assert_fault_locked(struct vm_fault *vmf) { mmap_assert_locked(vmf->vma->vm_mm); } #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; /* * WARNING: vma_init does not initialize vma->vm_lock. * Use vm_area_alloc()/vm_area_free() if vma needs locking. 
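 *
 * Illustrative contrast of the two patterns (hypothetical callers):
 *
 *	struct vm_area_struct tmp;
 *
 *	vma_init(&tmp, mm);
 *
 * versus a VMA that may later need to be locked:
 *
 *	struct vm_area_struct *vma = vm_area_alloc(mm);
 *	if (vma)
 *		vm_area_free(vma);
 *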
*/ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ static inline void vm_flags_init(struct vm_area_struct *vma, vm_flags_t flags) { ACCESS_PRIVATE(vma, __vm_flags) = flags; } /* * Use when VMA is part of the VMA tree and modifications need coordination * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and * it should be locked explicitly beforehand. */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } static inline void vm_flags_set(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) |= flags; } static inline void vm_flags_clear(struct vm_area_struct *vma, vm_flags_t flags) { vma_start_write(vma); ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; } /* * Use only if VMA is not part of the VMA tree or has no other users and * therefore needs no locking. */ static inline void __vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vm_flags_init(vma, (vma->vm_flags | set) & ~clear); } /* * Use only when the order of set/clear operations is unimportant, otherwise * use vm_flags_{set|clear} explicitly. */ static inline void vm_flags_mod(struct vm_area_struct *vma, vm_flags_t set, vm_flags_t clear) { vma_start_write(vma); __vm_flags_mod(vma, set, clear); } static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; } static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; } /* * Indicate if the VMA is a heap for the given task; for * /proc/PID/maps that is the heap of the main task. */ static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) { return vma->vm_start < vma->vm_mm->brk && vma->vm_end > vma->vm_mm->start_brk; } /* * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be * its "stack". It's not even well-defined for programs written * languages like Go. 
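 * The heuristic below therefore simply checks whether the VMA covers
 * mm->start_stack.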
*/ return vma->vm_start <= vma->vm_mm->start_stack && vma->vm_end >= vma->vm_mm->start_stack; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); if (!maybe_stack) return false; if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == VM_STACK_INCOMPLETE_SETUP) return true; return false; } static inline bool vma_is_foreign(struct vm_area_struct *vma) { if (!current->mm) return true; if (current->mm != vma->vm_mm) return true; return false; } static inline bool vma_is_accessible(struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } static inline bool is_shared_maywrite(vm_flags_t vm_flags) { return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == (VM_SHARED | VM_MAYWRITE); } static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) { return is_shared_maywrite(vma->vm_flags); } static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { return mas_find(&vmi->mas, max - 1); } static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) { /* * Uses mas_find() to get the first VMA when the iterator starts. * Calling mas_next() could skip the first entry. */ return mas_find(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi) { return mas_next_range(&vmi->mas, ULONG_MAX); } static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) { return mas_prev(&vmi->mas, 0); } static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { __mas_set_range(&vmi->mas, start, end - 1); mas_store_gfp(&vmi->mas, NULL, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } /* Free any unused preallocations */ static inline void vma_iter_free(struct vma_iterator *vmi) { mas_destroy(&vmi->mas); } static inline int vma_iter_bulk_store(struct vma_iterator *vmi, struct vm_area_struct *vma) { vmi->mas.index = vma->vm_start; vmi->mas.last = vma->vm_end - 1; mas_store(&vmi->mas, vma); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; return 0; } static inline void vma_iter_invalidate(struct vma_iterator *vmi) { mas_pause(&vmi->mas); } static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr) { mas_set(&vmi->mas, addr); } #define for_each_vma(__vmi, __vma) \ while (((__vma) = vma_next(&(__vmi))) != NULL) /* The MM code likes to work with exclusive end addresses */ #define for_each_vma_range(__vmi, __vma, __end) \ while (((__vma) = vma_find(&(__vmi), (__end))) != NULL) #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); bool vma_is_anon_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } static inline bool vma_is_anon_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; /* * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. 
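 *
 * A defensive caller only trusts the value after a sanity check, roughly
 * like this (illustrative sketch; MAX_PAGE_ORDER is assumed here to be the
 * allocator's upper bound on the order):
 *
 *	unsigned int order = compound_order(page);
 *
 *	if (unlikely(order > MAX_PAGE_ORDER))
 *		order = 0;
 *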
*/ static inline unsigned int compound_order(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 0; return folio->_flags_1 & 0xff; } /** * folio_order - The allocation order of a folio. * @folio: The folio. * * A folio is composed of 2^order pages. See get_order() for the definition * of order. * * Return: The order of the folio. */ static inline unsigned int folio_order(const struct folio *folio) { if (!folio_test_large(folio)) return 0; return folio->_flags_1 & 0xff; } #include <linux/huge_mm.h> /* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. */ /* * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); return page_ref_dec_and_test(page); } static inline int folio_put_testzero(struct folio *folio) { return put_page_testzero(&folio->page); } /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ static inline bool get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } static inline struct folio *folio_get_nontail_page(struct page *page) { if (unlikely(!get_page_unless_zero(page))) return NULL; return (struct folio *)page; } extern int page_is_ram(unsigned long pfn); enum { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED, }; int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); /* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else static inline bool is_vmalloc_addr(const void *x) { return false; } static inline int is_vmalloc_or_module_addr(const void *x) { return 0; } #endif /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for * debugging purposes or implementation of other core folio_*() primitives. */ static inline int folio_entire_mapcount(const struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_entire_mapcount) + 1; } static inline int folio_large_mapcount(const struct folio *folio) { VM_WARN_ON_FOLIO(!folio_test_large(folio), folio); return atomic_read(&folio->_large_mapcount) + 1; } /** * folio_mapcount() - Number of mappings of this folio. * @folio: The folio. * * The folio mapcount corresponds to the number of present user page table * entries that reference any part of a folio. Each such present user page * table entry must be paired with exactly on folio reference. * * For ordindary folios, each user page table entry (PTE/PMD/PUD/...) counts * exactly once. 
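 * (Illustrative example: an order-0 anonymous folio mapped in a parent and
 * in its child after fork() reports folio_mapcount() == 2.)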
* * For hugetlb folios, each abstracted "hugetlb" user page table entry that * references the entire folio counts exactly once, even when such special * page table entries are comprised of multiple ordinary page table entries. * * Will report 0 for pages which cannot be mapped into userspace, such as * slab, page tables and similar. * * Return: The number of times this folio is mapped. */ static inline int folio_mapcount(const struct folio *folio) { int mapcount; if (likely(!folio_test_large(folio))) { mapcount = atomic_read(&folio->_mapcount) + 1; if (page_mapcount_is_type(mapcount)) mapcount = 0; return mapcount; } return folio_large_mapcount(folio); } /** * folio_mapped - Is this folio mapped into userspace? * @folio: The folio. * * Return: True if any page in this folio is referenced by user page tables. */ static inline bool folio_mapped(const struct folio *folio) { return folio_mapcount(folio) >= 1; } /* * Return true if this page is mapped into pagetables. * For compound page it returns true if any sub-page of compound page is mapped, * even if this particular sub-page is not itself mapped by any PTE or PMD. */ static inline bool page_mapped(const struct page *page) { return folio_mapped(page_folio(page)); } static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); return compound_head(page); } static inline struct folio *virt_to_folio(const void *x) { struct page *page = virt_to_page(x); return page_folio(page); } void __folio_put(struct folio *folio); void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); int folio_mc_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { return PAGE_SIZE << compound_order(page); } /* Returns the number of bits needed for the number of bytes in a page */ static inline unsigned int page_shift(struct page *page) { return PAGE_SHIFT + compound_order(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return compound_order(page); } /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want * pte_mkwrite. But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. */ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pte = pte_mkwrite(pte, vma); return pte; } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); #endif /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. 
* * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A page may belong to an inode's memory mapping. In this case, page->mapping * is the pointer to the inode, and page->index is the file offset of the page, * in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that * case PG_swapcache is set, and page->private is an offset into the swapcache. * * In either case (swapcache or inode backed), the pagecache itself holds one * reference to the page. Setting PG_private should also increment the * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. * * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need * to be written back to the inode on disk, * - anonymous pages (including MAP_PRIVATE file mappings) which have been * modified may need to be swapped out to swap space and (later) to be read * back into memory. 
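 *
 * As a purely illustrative sketch of the usual reference discipline built
 * on the helpers below (do_something_with() is a hypothetical stand-in):
 *
 *	if (get_page_unless_zero(page)) {
 *		do_something_with(page);
 *		put_page(page);
 *	}
 *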
*/ #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX) DECLARE_STATIC_KEY_FALSE(devmap_managed_key); bool __put_devmap_managed_folio_refs(struct folio *folio, int refs); static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!folio_is_zone_device(folio)) return false; return __put_devmap_managed_folio_refs(folio, refs); } #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ static inline bool put_devmap_managed_folio_refs(struct folio *folio, int refs) { return false; } #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */ /* 127: arbitrary random number, small enough to assemble well */ #define folio_ref_zero_or_close_to_overflow(folio) \ ((unsigned int) folio_ref_count(folio) + 127u <= 127u) /** * folio_get - Increment the reference count on a folio. * @folio: The folio. * * Context: May be called in any context, as long as you know that * you have a refcount on the folio. If you do not already have one, * folio_try_get() may be the right interface for you to use. */ static inline void folio_get(struct folio *folio) { VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio); folio_ref_inc(folio); } static inline void get_page(struct page *page) { folio_get(page_folio(page)); } static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; page_ref_inc(page); return true; } /** * folio_put - Decrement the reference count on a folio. * @folio: The folio. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put() unless you can be sure that it wasn't the * last reference. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put(struct folio *folio) { if (folio_put_testzero(folio)) __folio_put(folio); } /** * folio_put_refs - Reduce the reference count on a folio. * @folio: The folio. * @refs: The amount to subtract from the folio's reference count. * * If the folio's reference count reaches zero, the memory will be * released back to the page allocator and may be used by another * allocation immediately. Do not access the memory or the struct folio * after calling folio_put_refs() unless you can be sure that these weren't * the last references. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folio_put_refs(struct folio *folio, int refs) { if (folio_ref_sub_and_test(folio, refs)) __folio_put(folio); } void folios_put_refs(struct folio_batch *folios, unsigned int *refs); /* * union release_pages_arg - an array of pages or folios * * release_pages() releases a simple array of multiple pages, and * accepts various different forms of said page array: either * a regular old boring array of pages, an array of folios, or * an array of encoded page pointers. * * The transparent union syntax for this kind of "any of these * argument types" is all kinds of ugly, so look away. 
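 *
 * The upshot, as a small illustration, is that both of these hypothetical
 * callers are valid and reach the same function:
 *
 *	struct page *pages[16];
 *	struct folio *folios[16];
 *
 *	release_pages(pages, 16);
 *	release_pages(folios, 16);
 *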
*/ typedef union { struct page **pages; struct folio **folios; struct encoded_page **encoded_pages; } release_pages_arg __attribute__ ((__transparent_union__)); void release_pages(release_pages_arg, int nr); /** * folios_put - Decrement the reference count on an array of folios. * @folios: The folios. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need to * reinitialise it. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ static inline void folios_put(struct folio_batch *folios) { folios_put_refs(folios, NULL); } static inline void put_page(struct page *page) { struct folio *folio = page_folio(page); /* * For some devmap managed pages we need to catch refcount transition * from 2 to 1: */ if (put_devmap_managed_folio_refs(folio, 1)) return; folio_put(folio); } /* * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload * the page's refcount so that two separate items are tracked: the original page * reference count, and also a new count of how many pin_user_pages() calls were * made against the page. ("gup-pinned" is another term for the latter). * * With this scheme, pin_user_pages() becomes special: such pages are marked as * distinct from normal pages. As such, the unpin_user_page() call (and its * variants) must be used in order to release gup-pinned pages. * * Choice of value: * * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference * counts with respect to pin_user_pages() and unpin_user_page() becomes * simpler, due to the fact that adding an even power of two to the page * refcount has the effect of using only the upper N bits, for the code that * counts up using the bias value. This means that the lower bits are left for * the exclusive use of the original code that increments and decrements by one * (or at least, by much smaller values than the bias value). * * Of course, once the lower bits overflow into the upper bits (and this is * OK, because subtraction recovers the original values), then visual inspection * no longer suffices to directly view the separate counts. However, for normal * applications that don't have huge page reference counts, this won't be an * issue. * * Locking: the lockless algorithm described in folio_try_get_rcu() * provides safe operation for get_user_pages(), folio_mkclean() and * other calls that race to set up page table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); void unpin_folio(struct folio *folio); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); void unpin_user_folio(struct folio *folio, unsigned long npages); void unpin_folios(struct folio **folios, unsigned long nfolios); static inline bool is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } #ifndef CONFIG_MMU static inline bool is_nommu_shared_mapping(vm_flags_t flags) { /* * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of * a file mapping. 
R/O MAP_PRIVATE mappings might still modify * underlying memory if ptrace is active, so this is only possible if * ptrace does not apply. Note that there is no mprotect() to upgrade * write permissions later. */ return flags & (VM_MAYSHARE | VM_MAYOVERLAY); } #endif #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif /* * The identification function is mainly used by the buddy allocator for * determining if two pages could be buddies. We are not really identifying * the zone since we could be using the section number id if we do not have * node id available in page flags. * We only guarantee that it will return the same value for two combinable * pages in a zone. */ static inline int page_zone_id(struct page *page) { return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif static inline int folio_nid(const struct folio *folio) { return page_to_nid(&folio->page); } #ifdef CONFIG_NUMA_BALANCING /* page access time bits needs to hold at least 4 seconds */ #define PAGE_ACCESS_TIME_MIN_BITS 12 #if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS #define PAGE_ACCESS_TIME_BUCKETS \ (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT) #else #define PAGE_ACCESS_TIME_BUCKETS 0 #endif #define PAGE_ACCESS_TIME_MASK \ (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS) static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } static inline int cpupid_to_pid(int cpupid) { return cpupid & LAST__PID_MASK; } static inline int cpupid_to_cpu(int cpupid) { return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } static inline int cpupid_to_nid(int cpupid) { return cpu_to_node(cpupid_to_cpu(cpupid)); } static inline bool cpupid_pid_unset(int cpupid) { return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } static inline bool cpupid_cpu_unset(int cpupid) { return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) { return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); } #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int folio_last_cpupid(struct folio *folio) { return folio->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else static inline int folio_last_cpupid(struct folio *folio) { return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ static inline int folio_xchg_access_time(struct folio *folio, int time) { int last_time; last_time = folio_xchg_last_cpupid(folio, time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { unsigned int pid_bit; pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { __set_bit(pid_bit, 
&vma->numab_state->pids_active[1]); } } bool folio_use_access_time(struct folio *folio); #else /* !CONFIG_NUMA_BALANCING */ static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { return folio_nid(folio); /* XXX */ } static inline int folio_xchg_access_time(struct folio *folio, int time) { return 0; } static inline int folio_last_cpupid(struct folio *folio) { return folio_nid(folio); /* XXX */ } static inline int cpupid_to_nid(int cpupid) { return -1; } static inline int cpupid_to_pid(int cpupid) { return -1; } static inline int cpupid_to_cpu(int cpupid) { return -1; } static inline int cpu_pid_to_cpupid(int nid, int pid) { return -1; } static inline bool cpupid_pid_unset(int cpupid) { return true; } static inline void page_cpupid_reset_last(struct page *page) { } static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) { } static inline bool folio_use_access_time(struct folio *folio) { return false; } #endif /* CONFIG_NUMA_BALANCING */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid * setting tags for all pages to native kernel tag value 0xff, as the default * value 0x00 maps to 0xff. */ static inline u8 page_kasan_tag(const struct page *page) { u8 tag = KASAN_TAG_KERNEL; if (kasan_enabled()) { tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; } return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { unsigned long old_flags, flags; if (!kasan_enabled()) return; tag ^= 0xff; old_flags = READ_ONCE(page->flags); do { flags = old_flags; flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); } static inline void page_kasan_tag_reset(struct page *page) { if (kasan_enabled()) page_kasan_tag_set(page, KASAN_TAG_KERNEL); } #else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline u8 page_kasan_tag(const struct page *page) { return 0xff; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } static inline pg_data_t *page_pgdat(const struct page *page) { return NODE_DATA(page_to_nid(page)); } static inline struct zone *folio_zone(const struct folio *folio) { return page_zone(&folio->page); } static inline pg_data_t *folio_pgdat(const struct folio *folio) { return page_pgdat(&folio->page); } #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long page_to_section(const struct page *page) { return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif /** * folio_pfn - Return the Page Frame Number of a folio. * @folio: The folio. * * A folio may contain multiple pages. The pages have consecutive * Page Frame Numbers. * * Return: The Page Frame Number of the first page in the folio. 
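 *
 * Illustrative consequences of the consecutive-PFN layout: the i-th page of
 * the folio has PFN folio_pfn(folio) + i, and the lookup round-trips, i.e.
 * pfn_folio(folio_pfn(folio)) == folio.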
*/ static inline unsigned long folio_pfn(const struct folio *folio) { return page_to_pfn(&folio->page); } static inline struct folio *pfn_folio(unsigned long pfn) { return page_folio(pfn_to_page(pfn)); } /** * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA. * @folio: The folio. * * This function checks if a folio has been pinned via a call to * a function in the pin_user_pages() family. * * For small folios, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably * pinned for DMA, but possibly a false positive due to having at least * GUP_PIN_COUNTING_BIAS worth of normal folio references". * * False positives are OK, because: a) it's unlikely for a folio to * get that many refcounts, and b) all the callers of this routine are * expected to be able to deal gracefully with a false positive. * * For large folios, the result will be exactly correct. That's because * we have more tracking data available: the _pincount field is used * instead of the GUP_PIN_COUNTING_BIAS scheme. * * For more information, please see Documentation/core-api/pin_user_pages.rst. * * Return: True, if it is likely that the folio has been "dma-pinned". * False, if the folio is definitely not dma-pinned. */ static inline bool folio_maybe_dma_pinned(struct folio *folio) { if (folio_test_large(folio)) return atomic_read(&folio->_pincount) > 0; /* * folio_ref_count() is signed. If that refcount overflows, then * folio_ref_count() returns a negative value, and callers will avoid * further incrementing the refcount. * * Here, for that overflow case, use the sign bit to count a little * bit higher via unsigned math, and thus still get an accurate result. */ return ((unsigned int)folio_ref_count(folio)) >= GUP_PIN_COUNTING_BIAS; } /* * This should most likely only be called during fork() to see whether we * should break the cow immediately for an anon page on the src mm. * * The caller has to hold the PT lock and the vma->vm_mm->->write_protect_seq. */ static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma, struct folio *folio) { VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1)); if (!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)) return false; return folio_maybe_dma_pinned(folio); } /** * is_zero_page - Query if a page is a zero page * @page: The page to query * * This returns true if @page is one of the permanent zero pages. */ static inline bool is_zero_page(const struct page *page) { return is_zero_pfn(page_to_pfn(page)); } /** * is_zero_folio - Query if a folio is a zero page * @folio: The folio to query * * This returns true if @folio is one of the permanent zero pages. */ static inline bool is_zero_folio(const struct folio *folio) { return is_zero_page(&folio->page); } /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ #ifdef CONFIG_MIGRATION static inline bool folio_is_longterm_pinnable(struct folio *folio) { #ifdef CONFIG_CMA int mt = folio_migratetype(folio); if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif /* The zero page can be "pinned" but gets special handling. */ if (is_zero_folio(folio)) return true; /* Coherent device memory must always allow eviction. */ if (folio_is_device_coherent(folio)) return false; /* Otherwise, non-movable zone folios can be pinned. 
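 * (Summary of the checks above, for illustration: CMA and isolated
 * pageblocks are refused, the zero folio is always allowed, device-coherent
 * folios are refused, and everything else is allowed unless it sits in
 * ZONE_MOVABLE, which the test below rejects.)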
*/ return !folio_is_zone_movable(folio); } #else static inline bool folio_is_longterm_pinnable(struct folio *folio) { return true; } #endif static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); #ifdef SECTION_IN_PAGE_FLAGS set_page_section(page, pfn_to_section_nr(pfn)); #endif } /** * folio_nr_pages - The number of pages in the folio. * @folio: The folio. * * Return: A positive power of two. */ static inline long folio_nr_pages(const struct folio *folio) { if (!folio_test_large(folio)) return 1; #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else return 1L << (folio->_flags_1 & 0xff); #endif } /* Only hugetlbfs can allocate folios larger than MAX_ORDER */ #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE #define MAX_FOLIO_NR_PAGES (1UL << PUD_ORDER) #else #define MAX_FOLIO_NR_PAGES MAX_ORDER_NR_PAGES #endif /* * compound_nr() returns the number of pages in this potentially compound * page. compound_nr() can be called on a tail page, and is defined to * return 1 in that case. */ static inline unsigned long compound_nr(struct page *page) { struct folio *folio = (struct folio *)page; if (!test_bit(PG_head, &folio->flags)) return 1; #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else return 1L << (folio->_flags_1 & 0xff); #endif } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ static inline int thp_nr_pages(struct page *page) { return folio_nr_pages((struct folio *)page); } /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. * * If you have physically contiguous memory which may span more than * one folio (eg a &struct bio_vec), use this function to move from one * folio to the next. Do not use it if the memory is only virtually * contiguous as the folios are almost certainly not adjacent to each * other. This is the folio equivalent to writing ``page++``. * * Context: We assume that the folios are refcounted and/or locked at a * higher level and do not adjust the reference counts. * Return: The next struct folio. */ static inline struct folio *folio_next(struct folio *folio) { return (struct folio *)folio_page(folio, folio_nr_pages(folio)); } /** * folio_shift - The size of the memory described by this folio. * @folio: The folio. * * A folio represents a number of bytes which is a power-of-two in size. * This function tells you which power-of-two the folio is. See also * folio_size() and folio_order(). * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The base-2 logarithm of the size of this folio. */ static inline unsigned int folio_shift(const struct folio *folio) { return PAGE_SHIFT + folio_order(folio); } /** * folio_size - The number of bytes in a folio. * @folio: The folio. * * Context: The caller should have a reference on the folio to prevent * it from being split. It is not necessary for the folio to be locked. * Return: The number of bytes in this folio. 
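 *
 * As a hedged illustration only (not part of this header), the size, order,
 * shift and page count of a valid folio relate as follows:
 *
 *      size_t size = folio_size(folio);
 *
 *      VM_WARN_ON_ONCE(size != PAGE_SIZE << folio_order(folio));
 *      VM_WARN_ON_ONCE(size != folio_nr_pages(folio) * PAGE_SIZE);
 *      VM_WARN_ON_ONCE(size != 1UL << folio_shift(folio));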
*/ static inline size_t folio_size(const struct folio *folio) { return PAGE_SIZE << folio_order(folio); } /** * folio_likely_mapped_shared - Estimate if the folio is mapped into the page * tables of more than one MM * @folio: The folio. * * This function checks if the folio is currently mapped into more than one * MM ("mapped shared"), or if the folio is only mapped into a single MM * ("mapped exclusively"). * * For KSM folios, this function also returns "mapped shared" when a folio is * mapped multiple times into the same MM, because the individual page mappings * are independent. * * As precise information is not easily available for all folios, this function * estimates the number of MMs ("sharers") that are currently mapping a folio * using the number of times the first page of the folio is currently mapped * into page tables. * * For small anonymous folios and anonymous hugetlb folios, the return * value will be exactly correct: non-KSM folios can only be mapped at most once * into an MM, and they cannot be partially mapped. KSM folios are * considered shared even if mapped multiple times into the same MM. * * For other folios, the result can be fuzzy: * #. For partially-mappable large folios (THP), the return value can wrongly * indicate "mapped exclusively" (false negative) when the folio is * only partially mapped into at least one MM. * #. For pagecache folios (including hugetlb), the return value can wrongly * indicate "mapped shared" (false positive) when two VMAs in the same MM * cover the same file range. * * Further, this function only considers current page table mappings that * are tracked using the folio mapcount(s). * * This function does not consider: * #. If the folio might get mapped in the (near) future (e.g., swapcache, * pagecache, temporary unmapping for migration). * #. If the folio is mapped differently (VM_PFNMAP). * #. If hugetlb page table sharing applies. Callers might want to check * hugetlb_pmd_shared(). * * Return: Whether the folio is estimated to be mapped into more than one MM. */ static inline bool folio_likely_mapped_shared(struct folio *folio) { int mapcount = folio_mapcount(folio); /* Only partially-mappable folios require more care. */ if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio))) return mapcount > 1; /* A single mapping implies "mapped exclusively". */ if (mapcount <= 1) return false; /* If any page is mapped more than once we treat it "mapped shared". */ if (folio_entire_mapcount(folio) || mapcount > folio_nr_pages(folio)) return true; /* Let's guess based on the first subpage. 
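 * _mapcount is stored with a bias of -1 (-1 == unmapped, 0 == mapped
 * exactly once), so a value above 0 means the first subpage is referenced
 * by more than one page table entry, which we take as an indication that
 * more than one MM maps the folio.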
*/ return atomic_read(&folio->_mapcount) > 0; } #ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE static inline int arch_make_folio_accessible(struct folio *folio) { return 0; } #endif /* * Some inline functions in vmstat.h depend on page_zone() */ #include <linux/vmstat.h> #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif #if defined(WANT_PAGE_VIRTUAL) static inline void *page_address(const struct page *page) { return page->virtual; } static inline void set_page_address(struct page *page, void *address) { page->virtual = address; } #define page_address_init() do { } while(0) #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(const struct page *page); void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif static __always_inline void *lowmem_page_address(const struct page *page) { return page_to_virt(page); } #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) #define page_address_init() do { } while(0) #endif static inline void *folio_address(const struct folio *folio) { return page_address(&folio->page); } /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool page_is_pfmemalloc(const struct page *page) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)page->lru.next & BIT(1); } /* * Return true only if the folio has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool folio_is_pfmemalloc(const struct folio *folio) { /* * lru.next has bit 1 set if the page is allocated from the * pfmemalloc reserves. Callers may simply overwrite it if * they do not need to preserve that information. */ return (uintptr_t)folio->lru.next & BIT(1); } /* * Only to be called by the page allocator on a freshly allocated * page. */ static inline void set_page_pfmemalloc(struct page *page) { page->lru.next = (void *)BIT(1); } static inline void clear_page_pfmemalloc(struct page *page) { page->lru.next = NULL; } /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. */ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) /* * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; /* * Whether to drop the pte markers, for example, the uffd-wp information for * file-backed memory. This should only be specified when we will completely * drop the page in the mm, either by truncation or unmapping of the vma. By * default, the flag is not set. */ #define ZAP_FLAG_DROP_MARKER ((__force zap_flags_t) BIT(0)) /* Set in unmap_vmas() to indicate a final unmap call. 
Only used by hugetlb */ #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) #ifdef CONFIG_SCHED_MM_CID void sched_mm_cid_before_execve(struct task_struct *t); void sched_mm_cid_after_execve(struct task_struct *t); void sched_mm_cid_fork(struct task_struct *t); void sched_mm_cid_exit_signals(struct task_struct *t); static inline int task_mm_cid(struct task_struct *t) { return t->mm_cid; } #else static inline void sched_mm_cid_before_execve(struct task_struct *t) { } static inline void sched_mm_cid_after_execve(struct task_struct *t) { } static inline void sched_mm_cid_fork(struct task_struct *t) { } static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } static inline int task_mm_cid(struct task_struct *t) { /* * Use the processor id as a fall-back when the mm cid feature is * disabled. This provides functional per-cpu data structure accesses * in user-space, although it won't provide the memory usage benefits. */ return raw_smp_processor_id(); } #endif #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details); static inline void zap_vma_pages(struct vm_area_struct *vma) { zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, unsigned long end, unsigned long tree_end, bool mm_wr_locked); struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); struct follow_pfnmap_args { /** * Inputs: * @vma: Pointer to @vm_area_struct struct * @address: the virtual address to walk */ struct vm_area_struct *vma; unsigned long address; /** * Internals: * * The caller shouldn't touch any of these.
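 *
 * For context, a hedged usage sketch of the surrounding API (illustration
 * only; @vma and @addr are assumed to describe a PFN mapping the caller may
 * access, and error handling is trimmed):
 *
 *      struct follow_pfnmap_args args = { .vma = vma, .address = addr };
 *
 *      if (follow_pfnmap_start(&args))
 *              return -EINVAL;
 *      pfn = args.pfn;
 *      writable = args.writable;
 *      follow_pfnmap_end(&args);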
*/ spinlock_t *lock; pte_t *ptep; /** * Outputs: * * @pfn: the PFN of the address * @pgprot: the pgprot_t of the mapping * @writable: whether the mapping is writable * @special: whether the mapping is a special mapping (real PFN maps) */ unsigned long pfn; pgprot_t pgprot; bool writable; bool special; }; int follow_pfnmap_start(struct follow_pfnmap_args *args); void follow_pfnmap_end(struct follow_pfnmap_args *args); extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int generic_error_remove_folio(struct address_space *mapping, struct folio *folio); struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm, unsigned long address, struct pt_regs *regs); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs); extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); return -EFAULT; } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif static inline void unmap_shared_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { unmap_mapping_range(mapping, holebegin, holelen, 0); } static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); /* * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. 
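 *
 * A hedged usage sketch (illustration only), assuming the caller holds the
 * mmap_lock of @mm in at least read mode and @addr is a user address:
 *
 *      struct vm_area_struct *vma;
 *      struct page *page;
 *
 *      page = get_user_page_vma_remote(mm, addr, FOLL_WRITE, &vma);
 *      if (IS_ERR(page))
 *              return PTR_ERR(page);
 *      ... use page and vma, then drop the reference ...
 *      put_page(page);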
*/ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, unsigned long addr, int gup_flags, struct vm_area_struct **vmap) { struct page *page; struct vm_area_struct *vma; int got; if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) return ERR_PTR(-EINVAL); got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); if (got < 0) return ERR_PTR(got); vma = vma_lookup(mm, addr); if (WARN_ON_ONCE(!vma)) { put_page(page); return ERR_PTR(-EINVAL); } *vmap = vma; return page; } long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end, struct folio **folios, unsigned int max_folios, pgoff_t *offset); int folio_add_pins(struct folio *folio, unsigned int pins); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); void folio_add_pin(struct folio *folio); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; struct page *get_dump_page(unsigned long addr); bool folio_mark_dirty(struct folio *folio); bool folio_mark_dirty_lock(struct folio *folio); bool set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); /* * Flags used by change_protection(). For now we make it a bitmap so * that we can pass in multiple flags just like parameters. However * for now all the callers are only use one of the flags at the same * time. */ /* * Whether we should manually check if we can map individual PTEs writable, * because something (e.g., COW, uffd-wp) blocks that from happening for all * PTEs automatically in a writable mapping. */ #define MM_CP_TRY_CHANGE_WRITABLE (1UL << 0) /* Whether this protection change is for NUMA hints */ #define MM_CP_PROT_NUMA (1UL << 1) /* Whether this change is for write protecting */ #define MM_CP_UFFD_WP (1UL << 2) /* do wp */ #define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */ #define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \ MM_CP_UFFD_WP_RESOLVE) bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, pte_t pte); extern long change_protection(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long cp_flags); extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); /* * doesn't attempt to fault and will return short. */ int get_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); static inline bool get_user_page_fast_only(unsigned long addr, unsigned int gup_flags, struct page **pagep) { return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; } /* * per-process(per-mm_struct) statistics. 
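 *
 * A hedged sketch of how these counters are typically driven (illustration
 * only): when a folio is mapped into or unmapped from an mm, the matching
 * counter is adjusted by the number of pages involved, e.g.
 *
 *      add_mm_counter(mm, mm_counter(folio), folio_nr_pages(folio));
 *
 * and get_mm_rss() below sums MM_FILEPAGES, MM_ANONPAGES and MM_SHMEMPAGES
 * to report the resident set size.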
*/ static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) { return percpu_counter_read_positive(&mm->rss_stat[member]); } void mm_trace_rss_stat(struct mm_struct *mm, int member); static inline void add_mm_counter(struct mm_struct *mm, int member, long value) { percpu_counter_add(&mm->rss_stat[member], value); mm_trace_rss_stat(mm, member); } static inline void inc_mm_counter(struct mm_struct *mm, int member) { percpu_counter_inc(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } static inline void dec_mm_counter(struct mm_struct *mm, int member) { percpu_counter_dec(&mm->rss_stat[member]); mm_trace_rss_stat(mm, member); } /* Optimized variant when folio is already known not to be anon */ static inline int mm_counter_file(struct folio *folio) { if (folio_test_swapbacked(folio)) return MM_SHMEMPAGES; return MM_FILEPAGES; } static inline int mm_counter(struct folio *folio) { if (folio_test_anon(folio)) return MM_ANONPAGES; return mm_counter_file(folio); } static inline unsigned long get_mm_rss(struct mm_struct *mm) { return get_mm_counter(mm, MM_FILEPAGES) + get_mm_counter(mm, MM_ANONPAGES) + get_mm_counter(mm, MM_SHMEMPAGES); } static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) { return max(mm->hiwater_rss, get_mm_rss(mm)); } static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) { return max(mm->hiwater_vm, mm->total_vm); } static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); if ((mm)->hiwater_rss < _rss) (mm)->hiwater_rss = _rss; } static inline void update_hiwater_vm(struct mm_struct *mm) { if (mm->hiwater_vm < mm->total_vm) mm->hiwater_vm = mm->total_vm; } static inline void reset_mm_hiwater_rss(struct mm_struct *mm) { mm->hiwater_rss = get_mm_rss(mm); } static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, struct mm_struct *mm) { unsigned long hiwater_rss = get_mm_hiwater_rss(mm); if (*maxrss < hiwater_rss) *maxrss = hiwater_rss; } #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL static inline int pte_special(pte_t pte) { return 0; } static inline pte_t pte_mkspecial(pte_t pte) { return pte; } #endif #ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP static inline bool pmd_special(pmd_t pmd) { return false; } static inline pmd_t pmd_mkspecial(pmd_t pmd) { return pmd; } #endif /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */ #ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP static inline bool pud_special(pud_t pud) { return false; } static inline pud_t pud_mkspecial(pud_t pud) { return pud; } #endif /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */ #ifndef CONFIG_ARCH_HAS_PTE_DEVMAP static inline int pte_devmap(pte_t pte) { return 0; } #endif extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl); static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pte_t *ptep; __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl)); return ptep; } #ifdef __PAGETABLE_P4D_FOLDED static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return 0; } #else int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); static inline void mm_inc_nr_puds(struct mm_struct 
*mm) { if (mm_pud_folded(mm)) return; atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) return; atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return 0; } static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); static inline void mm_inc_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_pmds(struct mm_struct *mm) { if (mm_pmd_folded(mm)) return; atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { atomic_long_set(&mm->pgtables_bytes, 0); } static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} #endif int __pte_alloc(struct mm_struct *mm, pmd_t *pmd); int __pte_alloc_kernel(pmd_t *pmd); #if defined(CONFIG_MMU) static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ? NULL : p4d_offset(pgd, address); } static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address); } static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } #endif /* CONFIG_MMU */ static inline struct ptdesc *virt_to_ptdesc(const void *x) { return page_ptdesc(virt_to_page(x)); } static inline void *ptdesc_to_virt(const struct ptdesc *pt) { return page_to_virt(ptdesc_page(pt)); } static inline void *ptdesc_address(const struct ptdesc *pt) { return folio_address(ptdesc_folio(pt)); } static inline bool pagetable_is_reserved(struct ptdesc *pt) { return folio_test_reserved(ptdesc_folio(pt)); } /** * pagetable_alloc - Allocate pagetables * @gfp: GFP flags * @order: desired pagetable order * * pagetable_alloc allocates memory for page tables as well as a page table * descriptor to describe that memory. * * Return: The ptdesc describing the allocated page tables. */ static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order) { struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); return page_ptdesc(page); } #define pagetable_alloc(...) 
alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__)) /** * pagetable_free - Free pagetables * @pt: The page table descriptor * * pagetable_free frees the memory of all page tables described by a page * table descriptor and the memory for the descriptor itself. */ static inline void pagetable_free(struct ptdesc *pt) { struct page *page = ptdesc_page(pt); __free_pages(page, compound_order(page)); } #if defined(CONFIG_SPLIT_PTE_PTLOCKS) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); void ptlock_free(struct ptdesc *ptdesc); static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return ptdesc->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) { } static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) { } static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { return &ptdesc->ptl; } #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); return ptlock_ptr(virt_to_ptdesc(pte)); } static inline bool ptlock_init(struct ptdesc *ptdesc) { /* * prep_new_page() initializes page->private (and therefore page->ptl) * with 0. Make sure nobody took it into use in between. * * That can happen if an architecture tries to use slab for page table * allocation: slab code uses page->slab_cache, which shares storage * with page->ptl. */ VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); if (!ptlock_alloc(ptdesc)) return false; spin_lock_init(ptlock_ptr(ptdesc)); return true; } #else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm.
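 *
 * Either way, most callers should not take the lock by hand: the
 * pte_offset_map_lock() helpers further below return whichever lock applies
 * to the configuration. A hedged usage sketch (illustration only):
 *
 *      spinlock_t *ptl;
 *      pte_t *pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 *
 *      if (!pte)
 *              return -EAGAIN;
 *      ... inspect or modify the PTE ...
 *      pte_unmap_unlock(pte, ptl);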
*/ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) { return &mm->page_table_lock; } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ static inline void __pagetable_ctor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); __folio_set_pgtable(folio); lruvec_stat_add_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); ptlock_free(ptdesc); __folio_clear_pgtable(folio); lruvec_stat_sub_folio(folio, NR_PAGETABLE); } static inline void pagetable_dtor_free(struct ptdesc *ptdesc) { pagetable_dtor(ptdesc); pagetable_free(ptdesc); } static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { if (!ptlock_init(ptdesc)) return false; __pagetable_ctor(ptdesc); return true; } pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { pte_t *pte; __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp)); return pte; } static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { return __pte_offset_map(pmd, addr, NULL); } pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { pte_t *pte; __cond_lock(RCU, __cond_lock(*ptlp, pte = __pte_offset_map_lock(mm, pmd, addr, ptlp))); return pte; } pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp, spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ pte_unmap(pte); \ } while (0) #define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd)) #define pte_alloc_map(mm, pmd, address) \ (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ (pte_alloc(mm, pmd) ? \ NULL : pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? 
\ NULL: pte_offset_kernel(pmd, address)) #if defined(CONFIG_SPLIT_PMD_PTLOCKS) static inline struct page *pmd_pgtable_page(pmd_t *pmd) { unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1); return virt_to_page((void *)((unsigned long) pmd & mask)); } static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) { return page_ptdesc(pmd_pgtable_page(pmd)); } static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(pmd_ptdesc(pmd)); } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptdesc->pmd_huge_pte = NULL; #endif return ptlock_init(ptdesc); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) #endif static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl = pmd_lockptr(mm, pmd); spin_lock(ptl); return ptl; } static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) { if (!pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); __pagetable_ctor(ptdesc); return true; } /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be * considered ready to switch to split PUD locks yet; there may be places * which need to be converted from page_table_lock. */ static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud) { return &mm->page_table_lock; } static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) { spinlock_t *ptl = pud_lockptr(mm, pud); spin_lock(ptl); return ptl; } static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc) { __pagetable_ctor(ptdesc); } extern void __init pagecache_init(void); extern void free_initmem(void); /* * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) * into the buddy system. The freed pages will be poisoned with pattern * "poison" if it's within range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ extern unsigned long free_reserved_area(void *start, void *end, int poison, const char *s); extern void adjust_managed_page_count(struct page *page, long count); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end, int nid); /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); #define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { SetPageReserved(page); adjust_managed_page_count(page, -1); } static inline void free_reserved_ptdesc(struct ptdesc *pt) { free_reserved_page(ptdesc_page(pt)); } /* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within * range [0, UCHAR_MAX]. * Return pages freed into the buddy system. 
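 *
 * A hedged sketch of the typical caller (illustration only): an architecture
 * with no special requirements can implement free_initmem() as little more
 * than
 *
 *      void free_initmem(void)
 *      {
 *              free_initmem_default(POISON_FREE_INITMEM);
 *      }
 *
 * where POISON_FREE_INITMEM selects the poison pattern and a negative value
 * would skip poisoning entirely.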
*/ static inline unsigned long free_initmem_default(int poison) { extern char __init_begin[], __init_end[]; return free_reserved_area(&__init_begin, &__init_end, poison, "unused kernel image (initmem)"); } static inline unsigned long get_num_physpages(void) { int nid; unsigned long phys_pages = 0; for_each_online_node(nid) phys_pages += node_present_pages(nid); return phys_pages; } /* * Using memblock node mappings, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in an * architecture independent manner. * * An architecture is expected to register range of page frames backed by * physical memory with memblock_add[_node]() before calling * free_area_init() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() * memblock_add_node(base, size, nid, MEMBLOCK_NONE) * free_area_init(max_zone_pfns); */ void free_area_init(unsigned long *max_zone_pfn); unsigned long node_map_pfn_alignment(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) { return 0; } #else /* please see mm/page_alloc.c */ extern int __meminit early_pfn_to_nid(unsigned long pfn); #endif extern void mem_init(void); extern void __init mmap_init(void); extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); static inline void show_mem(void) { __show_mem(0, NULL, MAX_NR_ZONES - 1); } extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); extern __printf(3, 4) void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...); extern void setup_per_cpu_pageset(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, struct rb_root_cached *root); void vma_interval_tree_insert_after(struct vm_area_struct *node, struct vm_area_struct *prev, struct rb_root_cached *root); void vma_interval_tree_remove(struct vm_area_struct *node, struct rb_root_cached *root); struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node, unsigned long start, unsigned long last); #define vma_interval_tree_foreach(vma, root, start, last) \ for (vma = vma_interval_tree_iter_first(root, start, last); \ vma; vma = vma_interval_tree_iter_next(vma, start, last)) void anon_vma_interval_tree_insert(struct anon_vma_chain *node, struct rb_root_cached *root); void anon_vma_interval_tree_remove(struct anon_vma_chain *node, struct rb_root_cached *root); struct anon_vma_chain * anon_vma_interval_tree_iter_first(struct rb_root_cached *root, unsigned long start, unsigned long last); struct anon_vma_chain *anon_vma_interval_tree_iter_next( struct anon_vma_chain *node, unsigned long start, unsigned long last); #ifdef CONFIG_DEBUG_VM_RB void anon_vma_interval_tree_verify(struct anon_vma_chain *node); #endif #define anon_vma_interval_tree_foreach(avc, root, start, last) \ for (avc = 
anon_vma_interval_tree_iter_first(root, start, last); \ avc; avc = anon_vma_interval_tree_iter_next(avc, start, last)) /* mmap.c */ extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void exit_mmap(struct mm_struct *); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, bool write); static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, unsigned long end_data, unsigned long start_data) { if (rlim < RLIM_INFINITY) { if (((new - start) + (end_data - start_data)) > rlim) return -ENOSPC; } return 0; } extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); extern bool vma_is_special_mapping(const struct vm_area_struct *vma, const struct vm_special_mapping *sm); extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, const struct vm_special_mapping *spec); unsigned long randomize_stack_top(unsigned long stack_top); unsigned long randomize_page(unsigned long start, unsigned long range); unsigned long __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags); static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); #ifdef CONFIG_MMU extern int __mm_populate(unsigned long addr, unsigned long len, int ignore_errors); static inline void mm_populate(unsigned long addr, unsigned long len) { /* Ignore errors */ (void) __mm_populate(addr, len, 1); } #else static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* This takes the mm semaphore itself */ extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); struct vm_unmapped_area_info { #define VM_UNMAPPED_AREA_TOPDOWN 1 unsigned long flags; unsigned long length; 
unsigned long low_limit; unsigned long high_limit; unsigned long align_mask; unsigned long align_offset; unsigned long start_gap; }; extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info); /* truncate.c */ extern void truncate_inode_pages(struct address_space *, loff_t); extern void truncate_inode_pages_range(struct address_space *, loff_t lstart, loff_t lend); extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); extern vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf); extern unsigned long stack_guard_gap; /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */ int expand_stack_locked(struct vm_area_struct *vma, unsigned long address); struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr); /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr); extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); /* * Look up the first VMA which intersects the interval [start_addr, end_addr) * NULL if none. Assume start_addr < end_addr. */ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address * @mm: The process address space. * @addr: The user address. * * Return: The vm_area_struct at the given address, %NULL otherwise. */ static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { return mtree_load(&mm->mm_mt, addr); } static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma) { if (vma->vm_flags & VM_GROWSDOWN) return stack_guard_gap; /* See reasoning around the VM_SHADOW_STACK definition */ if (vma->vm_flags & VM_SHADOW_STACK) return PAGE_SIZE; return 0; } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) { unsigned long gap = stack_guard_start_gap(vma); unsigned long vm_start = vma->vm_start; vm_start -= gap; if (vm_start > vma->vm_start) vm_start = 0; return vm_start; } static inline unsigned long vm_end_gap(struct vm_area_struct *vma) { unsigned long vm_end = vma->vm_end; if (vma->vm_flags & VM_GROWSUP) { vm_end += stack_guard_gap; if (vm_end < vma->vm_end) vm_end = -PAGE_SIZE; } return vm_end; } static inline unsigned long vma_pages(struct vm_area_struct *vma) { return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } /* Look up the first VMA which exactly match the interval vm_start ... 
vm_end */ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; return vma; } static inline bool range_in_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { return (vma && vma->vm_start <= start && end <= vma->vm_end); } #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) { return __pgprot(0); } static inline void vma_set_page_prot(struct vm_area_struct *vma) { vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); } #endif void vma_set_file(struct vm_area_struct *vma, struct file *file); #ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif struct vm_area_struct *find_extend_vma_locked(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num); int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num); int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { int err = vm_insert_page(vma, addr, page); if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } #ifndef io_remap_pfn_range static inline int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { return remap_pfn_range(vma, addr, pfn, size, pgprot_decrypted(prot)); } #endif static inline vm_fault_t vmf_error(int err) { if (err == -ENOMEM) return VM_FAULT_OOM; else if (err == -EHWPOISON) return VM_FAULT_HWPOISON; return VM_FAULT_SIGBUS; } /* * Convert errno to return value for ->page_mkwrite() calls. * * This should eventually be merged with vmf_error() above, but will need a * careful audit of all vmf_error() callers. */ static inline vm_fault_t vmf_fs_error(int err) { if (err == 0) return VM_FAULT_LOCKED; if (err == -EFAULT || err == -EAGAIN) return VM_FAULT_NOPAGE; if (err == -ENOMEM) return VM_FAULT_OOM; /* -ENOSPC, -EDQUOT, -EIO ... */ return VM_FAULT_SIGBUS; } static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) return -ENOMEM; if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) return (foll_flags & FOLL_HWPOISON) ? 
-EHWPOISON : -EFAULT; if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) return -EFAULT; return 0; } /* * Indicates whether GUP can follow a PROT_NONE mapped page, or whether * a (NUMA hinting) fault is required. */ static inline bool gup_can_follow_protnone(struct vm_area_struct *vma, unsigned int flags) { /* * If callers don't want to honor NUMA hinting faults, no need to * determine if we would actually have to trigger a NUMA hinting fault. */ if (!(flags & FOLL_HONOR_NUMA_FAULT)) return true; /* * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs. * * Requiring a fault here even for inaccessible VMAs would mean that * FOLL_FORCE cannot make any progress, because handle_mm_fault() * refuses to process NUMA hinting faults in inaccessible VMAs. */ return !vma_is_accessible(vma); } typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); extern bool _page_poisoning_enabled_early; DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled); static inline bool page_poisoning_enabled(void) { return _page_poisoning_enabled_early; } /* * For use in fast paths after init_mem_debugging() has run, or when a * false negative result is not harmful when called too early. */ static inline bool page_poisoning_enabled_static(void) { return static_branch_unlikely(&_page_poisoning_enabled); } static inline void kernel_poison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_poison_pages(page, numpages); } static inline void kernel_unpoison_pages(struct page *page, int numpages) { if (page_poisoning_enabled_static()) __kernel_unpoison_pages(page, numpages); } #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc); static inline bool want_init_on_alloc(gfp_t flags) { if (static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc)) return true; return flags & __GFP_ZERO; } DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free); static inline bool want_init_on_free(void) { return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON, &init_on_free); } extern bool _debug_pagealloc_enabled_early; DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); static inline bool debug_pagealloc_enabled(void) { return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && _debug_pagealloc_enabled_early; } /* * For use in fast paths after mem_debugging_and_hardening_init() has run, * or when a false negative result is not harmful when called too early. 
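 *
 * Unlike debug_pagealloc_enabled() above, this variant tests a static
 * branch, so when the feature is compiled in but not enabled the check
 * patches down to a no-op; the trade-off is a possible false negative if
 * it is called before the static key has been flipped during init.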
*/ static inline bool debug_pagealloc_enabled_static(void) { if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) return false; return static_branch_unlikely(&_debug_pagealloc_enabled); } /* * To support DEBUG_PAGEALLOC architecture must ensure that * __kernel_map_pages() never fails */ extern void __kernel_map_pages(struct page *page, int numpages, int enable); #ifdef CONFIG_DEBUG_PAGEALLOC static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 1); } static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) __kernel_map_pages(page, numpages, 0); } extern unsigned int _debug_guardpage_minorder; DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled); static inline unsigned int debug_guardpage_minorder(void) { return _debug_guardpage_minorder; } static inline bool debug_guardpage_enabled(void) { return static_branch_unlikely(&_debug_guardpage_enabled); } static inline bool page_is_guard(struct page *page) { if (!debug_guardpage_enabled()) return false; return PageGuard(page); } bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (!debug_guardpage_enabled()) return false; return __set_page_guard(zone, page, order); } void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order); static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) { if (!debug_guardpage_enabled()) return; __clear_page_guard(zone, page, order); } #else /* CONFIG_DEBUG_PAGEALLOC */ static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} static inline unsigned int debug_guardpage_minorder(void) { return 0; } static inline bool debug_guardpage_enabled(void) { return false; } static inline bool page_is_guard(struct page *page) { return false; } static inline bool set_page_guard(struct zone *zone, struct page *page, unsigned int order) { return false; } static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order) {} #endif /* CONFIG_DEBUG_PAGEALLOC */ #ifdef __HAVE_ARCH_GATE_AREA extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm); extern int in_gate_area_no_mm(unsigned long addr); extern int in_gate_area(struct mm_struct *mm, unsigned long addr); #else static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } static inline int in_gate_area_no_mm(unsigned long addr) { return 0; } static inline int in_gate_area(struct mm_struct *mm, unsigned long addr) { return 0; } #endif /* __HAVE_ARCH_GATE_AREA */ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; int drop_caches_sysctl_handler(const struct ctl_table *, int, void *, size_t *, loff_t *); #endif void drop_slab(void); #ifndef CONFIG_MMU #define randomize_va_space 0 #else extern int randomize_va_space; #endif const char * arch_vma_name(struct vm_area_struct *vma); #ifdef CONFIG_MMU void print_vma_addr(char *prefix, unsigned long rip); #else static inline void print_vma_addr(char *prefix, unsigned long rip) { } #endif void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap 
*pgmap); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node, struct vmem_altmap *altmap, struct page *reuse); void *vmemmap_alloc_block(unsigned long size, int node); struct vmem_altmap; void *vmemmap_alloc_block_buf(unsigned long size, int node, struct vmem_altmap *altmap); void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); void vmemmap_set_pmd(pmd_t *pmd, void *p, int node, unsigned long addr, unsigned long next); int vmemmap_check_pmd(pmd_t *pmd, int node, unsigned long addr, unsigned long next); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate_hugepages(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); int vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); void vmemmap_populate_print_last(void); #ifdef CONFIG_MEMORY_HOTPLUG void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { /* number of pfns from base where pfn_to_page() is valid */ if (altmap) return altmap->reserve + altmap->free; return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { altmap->alloc -= nr_pfns; } #else static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { } #endif #define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { unsigned long nr_pages; unsigned long nr_vmemmap_pages; if (!pgmap || !is_power_of_2(sizeof(struct page))) return false; nr_pages = pgmap_vmemmap_nr(pgmap); nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); /* * For vmemmap optimization with DAX we need minimum 2 vmemmap * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst */ return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } /* * If we don't have an architecture override, use the generic rule */ #ifndef vmemmap_can_optimize #define vmemmap_can_optimize __vmemmap_can_optimize #endif #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { return false; } #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *map, unsigned long nr_pages); enum mf_flags { MF_COUNT_INCREASED = 1 << 0, MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, MF_NO_RETRY = 1 << 6, MF_MEM_PRE_REMOVE = 1 << 7, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE /* * Sysfs entries for memory failure handling statistics. 
*/ extern const struct attribute_group memory_failure_attr_group; extern void memory_failure_queue(unsigned long pfn, int flags); extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void num_poisoned_pages_inc(unsigned long pfn); void num_poisoned_pages_sub(unsigned long pfn, long i); #else static inline void memory_failure_queue(unsigned long pfn, int flags) { } static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared) { return 0; } static inline void num_poisoned_pages_inc(unsigned long pfn) { } static inline void num_poisoned_pages_sub(unsigned long pfn, long i) { } #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) extern void memblk_nr_poison_inc(unsigned long pfn); extern void memblk_nr_poison_sub(unsigned long pfn, long i); #else static inline void memblk_nr_poison_inc(unsigned long pfn) { } static inline void memblk_nr_poison_sub(unsigned long pfn, long i) { } #endif #ifndef arch_memory_failure static inline int arch_memory_failure(unsigned long pfn, int flags) { return -ENXIO; } #endif #ifndef arch_is_platform_page static inline bool arch_is_platform_page(u64 paddr) { return false; } #endif /* * Error handlers for various types of pages. */ enum mf_result { MF_IGNORED, /* Error: cannot be handled */ MF_FAILED, /* Error: handling failed */ MF_DELAYED, /* Will be handled later */ MF_RECOVERED, /* Successfully recovered */ }; enum mf_action_page_type { MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, MF_MSG_DIRTY_MLOCKED_LRU, MF_MSG_CLEAN_MLOCKED_LRU, MF_MSG_DIRTY_UNEVICTABLE_LRU, MF_MSG_CLEAN_UNEVICTABLE_LRU, MF_MSG_DIRTY_LRU, MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_ALREADY_POISONED, MF_MSG_UNKNOWN, }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) void folio_zero_user(struct folio *folio, unsigned long addr_hint); int copy_user_large_folio(struct folio *dst, struct folio *src, unsigned long addr_hint, struct vm_area_struct *vma); long copy_folio_from_user(struct folio *dst_folio, const void __user *usr_src, bool allow_pagefault); /** * vma_is_special_huge - Are transhuge page-table entries considered special? * @vma: Pointer to the struct vm_area_struct to consider * * Whether transhuge page-table entries are considered "special" following * the definition in vm_normal_page(). * * Return: true if transhuge page-table entries should be considered special, * false otherwise. 
*/ static inline bool vma_is_special_huge(const struct vm_area_struct *vma) { return vma_is_dax(vma) || (vma->vm_file && (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if MAX_NUMNODES > 1 void __init setup_nr_node_ids(void); #else static inline void setup_nr_node_ids(void) {} #endif extern int memcmp_pages(struct page *page1, struct page *page2); static inline int pages_identical(struct page *page1, struct page *page2) { return !memcmp_pages(page1, page2); } #ifdef CONFIG_MAPPING_DIRTY_HELPERS unsigned long clean_record_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr, pgoff_t bitmap_pgoff, unsigned long *bitmap, pgoff_t *start, pgoff_t *end); unsigned long wp_shared_mapping_range(struct address_space *mapping, pgoff_t first_index, pgoff_t nr); #endif extern int sysctl_nr_trim_pages; #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name); #else static inline int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, struct anon_vma_name *anon_name) { return 0; } #endif #ifdef CONFIG_UNACCEPTED_MEMORY bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size); void accept_memory(phys_addr_t start, unsigned long size); #else static inline bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size) { return false; } static inline void accept_memory(phys_addr_t start, unsigned long size) { } #endif static inline bool pfn_is_unaccepted_memory(unsigned long pfn) { return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE); } void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma); int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); #ifdef CONFIG_64BIT int do_mseal(unsigned long start, size_t len_in, unsigned long flags); #else static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags) { /* noop on 32 bit */ return 0; } #endif /* * user_alloc_needs_zeroing checks if a user folio from page allocator needs to * be zeroed or not. */ static inline bool user_alloc_needs_zeroing(void) { /* * for user folios, arch with cache aliasing requires cache flush and * arc changes folio->flags to make icache coherent with dcache, so * always return false to make caller use * clear_user_page()/clear_user_highpage(). */ return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() || !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, &init_on_alloc); } int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status); int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status); int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #endif /* _LINUX_MM_H */ |
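/*
 * Illustrative sketch (editorial addition, not part of <linux/mm.h>): the
 * helper user_alloc_needs_zeroing() above returns true when the caller cannot
 * rely on init_on_alloc pre-zeroing (or when the CPU has aliasing caches) and
 * must therefore clear user folios itself. A hypothetical allocation path
 * might use it as below; example_prepare_user_folio() is an invented name,
 * and folio_zero_user() is only available with CONFIG_TRANSPARENT_HUGEPAGE
 * or CONFIG_HUGETLBFS.
 */
static inline void example_prepare_user_folio(struct folio *folio,
					      unsigned long addr_hint)
{
	if (user_alloc_needs_zeroing())
		folio_zero_user(folio, addr_hint);
}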
/*
 * Copyright (c) 2016 Intel Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that copyright
 * notice and this permission notice appear in supporting documentation, and
 * that the name of the copyright holders not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. The copyright holders make no representations
 * about the suitability of this software for any purpose. It is provided "as
 * is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
 * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
 * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
 * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THIS SOFTWARE.
 */

#include <linux/export.h>

#include <drm/drm_bridge.h>
#include <drm/drm_device.h>
#include <drm/drm_drv.h>
#include <drm/drm_encoder.h>
#include <drm/drm_managed.h>
#include <drm/drm_print.h>

#include "drm_crtc_internal.h"
#include "drm_internal.h"

/**
 * DOC: overview
 *
 * Encoders represent the connecting element between the CRTC (as the overall
 * pixel pipeline, represented by &struct drm_crtc) and the connectors (as the
 * generic sink entity, represented by &struct drm_connector). An encoder takes
 * pixel data from a CRTC and converts it to a format suitable for any attached
 * connector. Encoders are objects exposed to userspace, originally to allow
 * userspace to infer cloning and connector/CRTC restrictions. Unfortunately
 * almost all drivers get this wrong, making the uabi pretty much useless.
On * top of that the exposed restrictions are too simple for today's hardware, and * the recommended way to infer restrictions is by using the * DRM_MODE_ATOMIC_TEST_ONLY flag for the atomic IOCTL. * * Otherwise encoders aren't used in the uapi at all (any modeset request from * userspace directly connects a connector with a CRTC), drivers are therefore * free to use them however they wish. Modeset helper libraries make strong use * of encoders to facilitate code sharing. But for more complex settings it is * usually better to move shared code into a separate &drm_bridge. Compared to * encoders, bridges also have the benefit of being purely an internal * abstraction since they are not exposed to userspace at all. * * Encoders are initialized with drm_encoder_init() and cleaned up using * drm_encoder_cleanup(). */ static const struct drm_prop_enum_list drm_encoder_enum_list[] = { { DRM_MODE_ENCODER_NONE, "None" }, { DRM_MODE_ENCODER_DAC, "DAC" }, { DRM_MODE_ENCODER_TMDS, "TMDS" }, { DRM_MODE_ENCODER_LVDS, "LVDS" }, { DRM_MODE_ENCODER_TVDAC, "TV" }, { DRM_MODE_ENCODER_VIRTUAL, "Virtual" }, { DRM_MODE_ENCODER_DSI, "DSI" }, { DRM_MODE_ENCODER_DPMST, "DP MST" }, { DRM_MODE_ENCODER_DPI, "DPI" }, }; int drm_encoder_register_all(struct drm_device *dev) { struct drm_encoder *encoder; int ret = 0; drm_for_each_encoder(encoder, dev) { drm_debugfs_encoder_add(encoder); if (encoder->funcs && encoder->funcs->late_register) ret = encoder->funcs->late_register(encoder); if (ret) return ret; } return 0; } void drm_encoder_unregister_all(struct drm_device *dev) { struct drm_encoder *encoder; drm_for_each_encoder(encoder, dev) { if (encoder->funcs && encoder->funcs->early_unregister) encoder->funcs->early_unregister(encoder); drm_debugfs_encoder_remove(encoder); } } __printf(5, 0) static int __drm_encoder_init(struct drm_device *dev, struct drm_encoder *encoder, const struct drm_encoder_funcs *funcs, int encoder_type, const char *name, va_list ap) { int ret; /* encoder index is used with 32bit bitmasks */ if (WARN_ON(dev->mode_config.num_encoder >= 32)) return -EINVAL; ret = drm_mode_object_add(dev, &encoder->base, DRM_MODE_OBJECT_ENCODER); if (ret) return ret; encoder->dev = dev; encoder->encoder_type = encoder_type; encoder->funcs = funcs; if (name) { encoder->name = kvasprintf(GFP_KERNEL, name, ap); } else { encoder->name = kasprintf(GFP_KERNEL, "%s-%d", drm_encoder_enum_list[encoder_type].name, encoder->base.id); } if (!encoder->name) { ret = -ENOMEM; goto out_put; } INIT_LIST_HEAD(&encoder->bridge_chain); list_add_tail(&encoder->head, &dev->mode_config.encoder_list); encoder->index = dev->mode_config.num_encoder++; out_put: if (ret) drm_mode_object_unregister(dev, &encoder->base); return ret; } /** * drm_encoder_init - Init a preallocated encoder * @dev: drm device * @encoder: the encoder to init * @funcs: callbacks for this encoder * @encoder_type: user visible type of the encoder * @name: printf style format string for the encoder name, or NULL for default name * * Initializes a preallocated encoder. Encoder should be subclassed as part of * driver encoder objects. At driver unload time the driver's * &drm_encoder_funcs.destroy hook should call drm_encoder_cleanup() and kfree() * the encoder structure. The encoder structure should not be allocated with * devm_kzalloc(). * * Note: consider using drmm_encoder_alloc() or drmm_encoder_init() * instead of drm_encoder_init() to let the DRM managed resource * infrastructure take care of cleanup and deallocation. 
* * Returns: * Zero on success, error code on failure. */ int drm_encoder_init(struct drm_device *dev, struct drm_encoder *encoder, const struct drm_encoder_funcs *funcs, int encoder_type, const char *name, ...) { va_list ap; int ret; WARN_ON(!funcs->destroy); va_start(ap, name); ret = __drm_encoder_init(dev, encoder, funcs, encoder_type, name, ap); va_end(ap); return ret; } EXPORT_SYMBOL(drm_encoder_init); /** * drm_encoder_cleanup - cleans up an initialised encoder * @encoder: encoder to cleanup * * Cleans up the encoder but doesn't free the object. */ void drm_encoder_cleanup(struct drm_encoder *encoder) { struct drm_device *dev = encoder->dev; struct drm_bridge *bridge, *next; /* Note that the encoder_list is considered to be static; should we * remove the drm_encoder at runtime we would have to decrement all * the indices on the drm_encoder after us in the encoder_list. */ list_for_each_entry_safe(bridge, next, &encoder->bridge_chain, chain_node) drm_bridge_detach(bridge); drm_mode_object_unregister(dev, &encoder->base); kfree(encoder->name); list_del(&encoder->head); dev->mode_config.num_encoder--; memset(encoder, 0, sizeof(*encoder)); } EXPORT_SYMBOL(drm_encoder_cleanup); static void drmm_encoder_alloc_release(struct drm_device *dev, void *ptr) { struct drm_encoder *encoder = ptr; if (WARN_ON(!encoder->dev)) return; drm_encoder_cleanup(encoder); } __printf(5, 0) static int __drmm_encoder_init(struct drm_device *dev, struct drm_encoder *encoder, const struct drm_encoder_funcs *funcs, int encoder_type, const char *name, va_list args) { int ret; if (drm_WARN_ON(dev, funcs && funcs->destroy)) return -EINVAL; ret = __drm_encoder_init(dev, encoder, funcs, encoder_type, name, args); if (ret) return ret; ret = drmm_add_action_or_reset(dev, drmm_encoder_alloc_release, encoder); if (ret) return ret; return 0; } void *__drmm_encoder_alloc(struct drm_device *dev, size_t size, size_t offset, const struct drm_encoder_funcs *funcs, int encoder_type, const char *name, ...) { void *container; struct drm_encoder *encoder; va_list ap; int ret; container = drmm_kzalloc(dev, size, GFP_KERNEL); if (!container) return ERR_PTR(-ENOMEM); encoder = container + offset; va_start(ap, name); ret = __drmm_encoder_init(dev, encoder, funcs, encoder_type, name, ap); va_end(ap); if (ret) return ERR_PTR(ret); return container; } EXPORT_SYMBOL(__drmm_encoder_alloc); /** * drmm_encoder_init - Initialize a preallocated encoder * @dev: drm device * @encoder: the encoder to init * @funcs: callbacks for this encoder (optional) * @encoder_type: user visible type of the encoder * @name: printf style format string for the encoder name, or NULL for default name * * Initializes a preallocated encoder. Encoder should be subclassed as * part of driver encoder objects. Cleanup is automatically handled * through registering drm_encoder_cleanup() with drmm_add_action(). The * encoder structure should be allocated with drmm_kzalloc(). * * The @drm_encoder_funcs.destroy hook must be NULL. * * Returns: * Zero on success, error code on failure. */ int drmm_encoder_init(struct drm_device *dev, struct drm_encoder *encoder, const struct drm_encoder_funcs *funcs, int encoder_type, const char *name, ...) 
{ va_list ap; int ret; va_start(ap, name); ret = __drmm_encoder_init(dev, encoder, funcs, encoder_type, name, ap); va_end(ap); if (ret) return ret; return 0; } EXPORT_SYMBOL(drmm_encoder_init); static struct drm_crtc *drm_encoder_get_crtc(struct drm_encoder *encoder) { struct drm_connector *connector; struct drm_device *dev = encoder->dev; bool uses_atomic = false; struct drm_connector_list_iter conn_iter; /* For atomic drivers only state objects are synchronously updated and * protected by modeset locks, so check those first. */ drm_connector_list_iter_begin(dev, &conn_iter); drm_for_each_connector_iter(connector, &conn_iter) { if (!connector->state) continue; uses_atomic = true; if (connector->state->best_encoder != encoder) continue; drm_connector_list_iter_end(&conn_iter); return connector->state->crtc; } drm_connector_list_iter_end(&conn_iter); /* Don't return stale data (e.g. pending async disable). */ if (uses_atomic) return NULL; return encoder->crtc; } int drm_mode_getencoder(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_get_encoder *enc_resp = data; struct drm_encoder *encoder; struct drm_crtc *crtc; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; encoder = drm_encoder_find(dev, file_priv, enc_resp->encoder_id); if (!encoder) return -ENOENT; drm_modeset_lock(&dev->mode_config.connection_mutex, NULL); crtc = drm_encoder_get_crtc(encoder); if (crtc && drm_lease_held(file_priv, crtc->base.id)) enc_resp->crtc_id = crtc->base.id; else enc_resp->crtc_id = 0; drm_modeset_unlock(&dev->mode_config.connection_mutex); enc_resp->encoder_type = encoder->encoder_type; enc_resp->encoder_id = encoder->base.id; enc_resp->possible_crtcs = drm_lease_filter_crtcs(file_priv, encoder->possible_crtcs); enc_resp->possible_clones = encoder->possible_clones; return 0; } |
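/*
 * Illustrative sketch (editorial addition, not part of drm_encoder.c): the
 * kernel-doc above recommends the DRM-managed variants over drm_encoder_init().
 * A hypothetical driver (struct my_output and my_output_bind() are invented
 * names) could embed the encoder in its own object allocated with
 * drmm_kzalloc() and initialize it with drmm_encoder_init(), leaving
 * &drm_encoder_funcs.destroy NULL so cleanup is handled automatically.
 */
struct my_output {
	struct drm_encoder encoder;
};

static int my_output_bind(struct drm_device *dev)
{
	struct my_output *out;
	int ret;

	out = drmm_kzalloc(dev, sizeof(*out), GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	ret = drmm_encoder_init(dev, &out->encoder, NULL,
				DRM_MODE_ENCODER_TMDS, NULL);
	if (ret)
		return ret;

	/* Assume the encoder can only feed the driver's first CRTC. */
	out->encoder.possible_crtcs = BIT(0);

	return 0;
}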
/* SPDX-License-Identifier: GPL-2.0 */
#ifdef pr_fmt
#undef pr_fmt
#endif

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include "amigaffs.h"
#include <linux/mutex.h>
#include <linux/workqueue.h>

/* Ugly macros make the code more pretty. */

#define AFFS_BLOCK(sb, bh, blk)		(AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
#define AFFS_HEAD(bh)			((struct affs_head *)(bh)->b_data)
#define AFFS_TAIL(sb, bh)		((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail)))
#define AFFS_ROOT_HEAD(bh)		((struct affs_root_head *)(bh)->b_data)
#define AFFS_ROOT_TAIL(sb, bh)		((struct affs_root_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_root_tail)))
#define AFFS_DATA_HEAD(bh)		((struct affs_data_head *)(bh)->b_data)
#define AFFS_DATA(bh)			(((struct affs_data_head *)(bh)->b_data)->data)

#define AFFS_CACHE_SIZE		PAGE_SIZE

#define AFFS_LC_SIZE		(AFFS_CACHE_SIZE/sizeof(u32)/2)
#define AFFS_AC_SIZE		(AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
#define AFFS_AC_MASK		(AFFS_AC_SIZE-1)

#define AFFSNAMEMAX 30U

struct affs_ext_key {
	u32	ext;			/* idx of the extended block */
	u32	key;			/* block number */
};

/*
 * affs fs inode data in memory
 */
struct affs_inode_info {
	atomic_t i_opencnt;
	struct mutex i_link_lock;	/* Protects internal inode access. */
	struct mutex i_ext_lock;	/* Protects internal inode access.
*/ #define i_hash_lock i_ext_lock u32 i_blkcnt; /* block count */ u32 i_extcnt; /* extended block count */ u32 *i_lc; /* linear cache of extended blocks */ u32 i_lc_size; u32 i_lc_shift; u32 i_lc_mask; struct affs_ext_key *i_ac; /* associative cache of extended blocks */ u32 i_ext_last; /* last accessed extended block */ struct buffer_head *i_ext_bh; /* bh of last extended block */ loff_t mmu_private; u32 i_protect; /* unused attribute bits */ u32 i_lastalloc; /* last allocated block */ int i_pa_cnt; /* number of preallocated blocks */ struct inode vfs_inode; }; /* short cut to get to the affs specific inode data */ static inline struct affs_inode_info *AFFS_I(struct inode *inode) { return container_of(inode, struct affs_inode_info, vfs_inode); } /* * super-block data in memory * * Block numbers are adjusted for their actual size * */ struct affs_bm_info { u32 bm_key; /* Disk block number */ u32 bm_free; /* Free blocks in here */ }; struct affs_sb_info { int s_partition_size; /* Partition size in blocks. */ int s_reserved; /* Number of reserved blocks. */ //u32 s_blksize; /* Initial device blksize */ u32 s_data_blksize; /* size of the data block w/o header */ u32 s_root_block; /* FFS root block number. */ int s_hashsize; /* Size of hash table. */ unsigned long s_flags; /* See below. */ kuid_t s_uid; /* uid to override */ kgid_t s_gid; /* gid to override */ umode_t s_mode; /* mode to override */ struct buffer_head *s_root_bh; /* Cached root block. */ struct mutex s_bmlock; /* Protects bitmap access. */ struct affs_bm_info *s_bitmap; /* Bitmap infos. */ u32 s_bmap_count; /* # of bitmap blocks. */ u32 s_bmap_bits; /* # of bits in one bitmap blocks */ u32 s_last_bmap; struct buffer_head *s_bmap_bh; char *s_prefix; /* Prefix for volumes and assigns. */ char s_volume[32]; /* Volume prefix for absolute symlinks. */ spinlock_t symlink_lock; /* protects the previous two */ struct super_block *sb; /* the VFS superblock object */ int work_queued; /* non-zero delayed work is queued */ struct delayed_work sb_work; /* superblock flush delayed work */ spinlock_t work_lock; /* protects sb_work and work_queued */ struct rcu_head rcu; }; #define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */ #define AFFS_MOUNT_SF_BM_VALID 0x0002 /* Bitmap is valid. 
*/ #define AFFS_MOUNT_SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */ #define AFFS_MOUNT_SF_QUIET 0x0008 /* chmod errors will be not reported */ #define AFFS_MOUNT_SF_SETUID 0x0010 /* Ignore Amiga uid */ #define AFFS_MOUNT_SF_SETGID 0x0020 /* Ignore Amiga gid */ #define AFFS_MOUNT_SF_SETMODE 0x0040 /* Ignore Amiga protection bits */ #define AFFS_MOUNT_SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */ #define AFFS_MOUNT_SF_OFS 0x0200 /* Old filesystem */ #define AFFS_MOUNT_SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ #define AFFS_MOUNT_SF_VERBOSE 0x0800 /* Talk about fs when mounting */ #define AFFS_MOUNT_SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ #define affs_clear_opt(o, opt) (o &= ~AFFS_MOUNT_##opt) #define affs_set_opt(o, opt) (o |= AFFS_MOUNT_##opt) #define affs_test_opt(o, opt) ((o) & AFFS_MOUNT_##opt) /* short cut to get to the affs specific sb data */ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) { return sb->s_fs_info; } void affs_mark_sb_dirty(struct super_block *sb); /* amigaffs.c */ extern int affs_insert_hash(struct inode *inode, struct buffer_head *bh); extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh); extern int affs_remove_header(struct dentry *dentry); extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); extern void affs_secs_to_datestamp(time64_t secs, struct affs_date *ds); extern umode_t affs_prot_to_mode(u32 prot); extern void affs_mode_to_prot(struct inode *inode); __printf(3, 4) extern void affs_error(struct super_block *sb, const char *function, const char *fmt, ...); __printf(3, 4) extern void affs_warning(struct super_block *sb, const char *function, const char *fmt, ...); extern bool affs_nofilenametruncate(const struct dentry *dentry); extern int affs_check_name(const unsigned char *name, int len, bool notruncate); extern int affs_copy_name(unsigned char *bstr, struct dentry *dentry); /* bitmap. 
c */ extern u32 affs_count_free_blocks(struct super_block *s); extern void affs_free_block(struct super_block *sb, u32 block); extern u32 affs_alloc_block(struct inode *inode, u32 goal); extern int affs_init_bitmap(struct super_block *sb, int *flags); extern void affs_free_bitmap(struct super_block *sb); /* namei.c */ extern const struct export_operations affs_export_ops; extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); extern int affs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool); extern int affs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); extern int affs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname); extern int affs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); /* inode.c */ extern struct inode *affs_new_inode(struct inode *dir); extern int affs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); extern void affs_evict_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, unsigned long ino); extern int affs_write_inode(struct inode *inode, struct writeback_control *wbc); extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); /* file.c */ void affs_free_prealloc(struct inode *inode); extern void affs_truncate(struct inode *); int affs_file_fsync(struct file *, loff_t, loff_t, int); /* dir.c */ extern void affs_dir_truncate(struct inode *); /* jump tables */ extern const struct inode_operations affs_file_inode_operations; extern const struct inode_operations affs_dir_inode_operations; extern const struct inode_operations affs_symlink_inode_operations; extern const struct file_operations affs_file_operations; extern const struct file_operations affs_file_operations_ofs; extern const struct file_operations affs_dir_operations; extern const struct address_space_operations affs_symlink_aops; extern const struct address_space_operations affs_aops; extern const struct address_space_operations affs_aops_ofs; extern const struct dentry_operations affs_dentry_operations; extern const struct dentry_operations affs_intl_dentry_operations; static inline bool affs_validblock(struct super_block *sb, int block) { return(block >= AFFS_SB(sb)->s_reserved && block < AFFS_SB(sb)->s_partition_size); } static inline void affs_set_blocksize(struct super_block *sb, int size) { sb_set_blocksize(sb, size); } static inline struct buffer_head * affs_bread(struct super_block *sb, int block) { pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) return sb_bread(sb, block); return NULL; } static inline struct buffer_head * affs_getblk(struct super_block *sb, int block) { pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) return sb_getblk(sb, block); return NULL; } static inline struct buffer_head * affs_getzeroblk(struct super_block *sb, int block) { struct buffer_head *bh; pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) { bh = sb_getblk(sb, block); lock_buffer(bh); 
memset(bh->b_data, 0 , sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); return bh; } return NULL; } static inline struct buffer_head * affs_getemptyblk(struct super_block *sb, int block) { struct buffer_head *bh; pr_debug("%s: %d\n", __func__, block); if (affs_validblock(sb, block)) { bh = sb_getblk(sb, block); wait_on_buffer(bh); set_buffer_uptodate(bh); return bh; } return NULL; } static inline void affs_brelse(struct buffer_head *bh) { if (bh) pr_debug("%s: %lld\n", __func__, (long long) bh->b_blocknr); brelse(bh); } static inline void affs_adjust_checksum(struct buffer_head *bh, u32 val) { u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[5]); ((__be32 *)bh->b_data)[5] = cpu_to_be32(tmp - val); } static inline void affs_adjust_bitmapchecksum(struct buffer_head *bh, u32 val) { u32 tmp = be32_to_cpu(((__be32 *)bh->b_data)[0]); ((__be32 *)bh->b_data)[0] = cpu_to_be32(tmp - val); } static inline void affs_lock_link(struct inode *inode) { mutex_lock(&AFFS_I(inode)->i_link_lock); } static inline void affs_unlock_link(struct inode *inode) { mutex_unlock(&AFFS_I(inode)->i_link_lock); } static inline void affs_lock_dir(struct inode *inode) { mutex_lock_nested(&AFFS_I(inode)->i_hash_lock, SINGLE_DEPTH_NESTING); } static inline void affs_unlock_dir(struct inode *inode) { mutex_unlock(&AFFS_I(inode)->i_hash_lock); } static inline void affs_lock_ext(struct inode *inode) { mutex_lock(&AFFS_I(inode)->i_ext_lock); } static inline void affs_unlock_ext(struct inode *inode) { mutex_unlock(&AFFS_I(inode)->i_ext_lock); } |
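/*
 * Illustrative sketch (editorial addition, not part of affs.h): the wrappers
 * above bound every block access by affs_validblock() and each successful
 * affs_bread()/affs_getblk() must be balanced by affs_brelse(). A hypothetical
 * helper (example_root_checksum() is an invented name) showing the intended
 * calling pattern against the declarations in this header:
 */
static inline u32 example_root_checksum(struct super_block *sb)
{
	struct buffer_head *bh;
	u32 chksum;

	bh = affs_bread(sb, AFFS_SB(sb)->s_root_block);
	if (!bh)
		return 0;	/* block invalid or I/O error */

	chksum = affs_checksum_block(sb, bh);
	affs_brelse(bh);
	return chksum;
}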
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
*/ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/close_range.h> #include <linux/file_ref.h> #include <net/sock.h> #include <linux/init_task.h> #include "internal.h" /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count * @cnt: Current reference count * * Invoked when the reference count is outside of the valid zone. * * Return: * True if this was the last reference with no future references * possible. This signals the caller that it can safely schedule the * object, which is protected by the reference counter, for * deconstruction. * * False if there are still active references or the put() raced * with a concurrent get()/put() pair. Caller is not allowed to * deconstruct the protected object. */ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) { /* Did this drop the last reference? */ if (likely(cnt == FILE_REF_NOREF)) { /* * Carefully try to set the reference count to FILE_REF_DEAD. * * This can fail if a concurrent get() operation has * elevated it again or the corresponding put() even marked * it dead already. Both are valid situations and do not * require a retry. If this fails the caller is not * allowed to deconstruct the object. */ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD)) return false; /* * The caller can safely schedule the object for * deconstruction. Provide acquire ordering. */ smp_acquire__after_ctrl_dep(); return true; } /* * If the reference count was already in the dead zone, then this * put() operation is imbalanced. Warn, put the reference count back to * DEAD and tell the caller to not deconstruct the object. */ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { atomic_long_set(&ref->refcnt, FILE_REF_DEAD); return false; } /* * This is a put() operation on a saturated refcount. Restore the * mean saturation value and tell the caller to not deconstruct the * object. */ if (cnt > FILE_REF_MAXREF) atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); return false; } EXPORT_SYMBOL_GPL(__file_ref_put); unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; /* our min() is unusable in constant expressions ;-/ */ #define __const_min(x, y) ((x) < (y) ? (x) : (y)) unsigned int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void __free_fdtable(struct fdtable *fdt) { kvfree(fdt->fd); kvfree(fdt->open_fds); kfree(fdt); } static void free_fdtable_rcu(struct rcu_head *rcu) { __free_fdtable(container_of(rcu, struct fdtable, rcu)); } #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) #define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds /* * Copy 'count' fd bits from the old table to the new table and clear the extra * space if any. This does not copy the file pointers. Called with the files * spinlock held for write. 
*/ static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, unsigned int copy_words) { unsigned int nwords = fdt_words(nfdt); bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, copy_words, nwords); } /* * Copy all file descriptors from the old table to the new, expanded table and * clear the extra space. Called with the files spinlock held for write. */ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) { size_t cpy, set; BUG_ON(nfdt->max_fds < ofdt->max_fds); cpy = ofdt->max_fds * sizeof(struct file *); set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); memcpy(nfdt->fd, ofdt->fd, cpy); memset((char *)nfdt->fd + cpy, 0, set); copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); } /* * Note how the fdtable bitmap allocations very much have to be a multiple of * BITS_PER_LONG. This is not only because we walk those things in chunks of * 'unsigned long' in some places, but simply because that is how the Linux * kernel bitmaps are defined to work: they are not "bits in an array of bytes", * they are very much "bits in an array of unsigned long". */ static struct fdtable *alloc_fdtable(unsigned int slots_wanted) { struct fdtable *fdt; unsigned int nr; void *data; /* * Figure out how many fds we actually want to support in this fdtable. * Allocation steps are keyed to the size of the fdarray, since it * grows far faster than any of the other dynamic data. We try to fit * the fdarray into comfortable page-tuned chunks: starting at 1024B * and growing in powers of two from there on. Since we called only * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab * already gives BITS_PER_LONG slots), the above boils down to * 1. use the smallest power of two large enough to give us that many * slots. * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is * 256 slots (i.e. 1Kb fd array). * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there * and we are never going to be asked for 64 or less. */ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256) nr = 256; else nr = roundup_pow_of_two(slots_wanted); /* * Note that this can drive nr *below* what we had passed if sysctl_nr_open * had been set lower between the check in expand_files() and here. * * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise * bitmaps handling below becomes unpleasant, to put it mildly... */ if (unlikely(nr > sysctl_nr_open)) { nr = round_down(sysctl_nr_open, BITS_PER_LONG); if (nr < slots_wanted) return ERR_PTR(-EMFILE); } fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT); if (!data) goto out_fdt; fdt->fd = data; data = kvmalloc(max_t(size_t, 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES), GFP_KERNEL_ACCOUNT); if (!data) goto out_arr; fdt->open_fds = data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = data; data += nr / BITS_PER_BYTE; fdt->full_fds_bits = data; return fdt; out_arr: kvfree(fdt->fd); out_fdt: kfree(fdt); out: return ERR_PTR(-ENOMEM); } /* * Expand the file descriptor table. * This function will allocate a new fdtable and both fd array and fdset, of * the given size. * Return <0 error code on error; 0 on successful completion. 
* The files->file_lock should be held on entry, and will be held on exit. */ static int expand_fdtable(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *new_fdt, *cur_fdt; spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr + 1); /* make sure all fd_install() have seen resize_in_progress * or have finished their rcu_read_lock_sched() section. */ if (atomic_read(&files->count) > 1) synchronize_rcu(); spin_lock(&files->file_lock); if (IS_ERR(new_fdt)) return PTR_ERR(new_fdt); cur_fdt = files_fdtable(files); BUG_ON(nr < cur_fdt->max_fds); copy_fdtable(new_fdt, cur_fdt); rcu_assign_pointer(files->fdt, new_fdt); if (cur_fdt != &files->fdtab) call_rcu(&cur_fdt->rcu, free_fdtable_rcu); /* coupled with smp_rmb() in fd_install() */ smp_wmb(); return 0; } /* * Expand files. * This function will expand the file structures, if the requested size exceeds * the current capacity and there is room for expansion. * Return <0 error code on error; 0 on success. * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, unsigned int nr) __releases(files->file_lock) __acquires(files->file_lock) { struct fdtable *fdt; int error; repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) return 0; if (unlikely(files->resize_in_progress)) { spin_unlock(&files->file_lock); wait_event(files->resize_wait, !files->resize_in_progress); spin_lock(&files->file_lock); goto repeat; } /* Can we expand? */ if (unlikely(nr >= sysctl_nr_open)) return -EMFILE; /* All good, so we try */ files->resize_in_progress = true; error = expand_fdtable(files, nr); files->resize_in_progress = false; wake_up_all(&files->resize_wait); return error; } static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt, bool set) { if (set) { __set_bit(fd, fdt->close_on_exec); } else { if (test_bit(fd, fdt->close_on_exec)) __clear_bit(fd, fdt->close_on_exec); } } static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set) { __set_bit(fd, fdt->open_fds); __set_close_on_exec(fd, fdt, set); fd /= BITS_PER_LONG; if (!~fdt->open_fds[fd]) __set_bit(fd, fdt->full_fds_bits); } static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt) { __clear_bit(fd, fdt->open_fds); fd /= BITS_PER_LONG; if (test_bit(fd, fdt->full_fds_bits)) __clear_bit(fd, fdt->full_fds_bits); } static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt) { return test_bit(fd, fdt->open_fds); } /* * Note that a sane fdtable size always has to be a multiple of * BITS_PER_LONG, since we have bitmaps that are sized by this. * * punch_hole is optional - when close_range() is asked to unshare * and close, we don't need to copy descriptors in that range, so * a smaller cloned descriptor table might suffice if the last * currently opened descriptor falls into that range. */ static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole) { unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds); if (last == fdt->max_fds) return NR_OPEN_DEFAULT; if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) { last = find_last_bit(fdt->open_fds, punch_hole->from); if (last == punch_hole->from) return NR_OPEN_DEFAULT; } return ALIGN(last + 1, BITS_PER_LONG); } /* * Allocate a new descriptor table and copy contents from the passed in * instance. Returns a pointer to cloned table on success, ERR_PTR() * on failure. 
For 'punch_hole' see sane_fdtable_size(). */ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole) { struct files_struct *newf; struct file **old_fds, **new_fds; unsigned int open_files, i; struct fdtable *old_fdt, *new_fdt; newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); if (!newf) return ERR_PTR(-ENOMEM); atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); newf->resize_in_progress = false; init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; new_fdt->close_on_exec = newf->close_on_exec_init; new_fdt->open_fds = newf->open_fds_init; new_fdt->full_fds_bits = newf->full_fds_bits_init; new_fdt->fd = &newf->fd_array[0]; spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); /* * Check whether we need to allocate a larger fd array and fd set. */ while (unlikely(open_files > new_fdt->max_fds)) { spin_unlock(&oldf->file_lock); if (new_fdt != &newf->fdtab) __free_fdtable(new_fdt); new_fdt = alloc_fdtable(open_files); if (IS_ERR(new_fdt)) { kmem_cache_free(files_cachep, newf); return ERR_CAST(new_fdt); } /* * Reacquire the oldf lock and a pointer to its fd table * who knows it may have a new bigger fd table. We need * the latest pointer. */ spin_lock(&oldf->file_lock); old_fdt = files_fdtable(oldf); open_files = sane_fdtable_size(old_fdt, punch_hole); } copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); old_fds = old_fdt->fd; new_fds = new_fdt->fd; for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; if (f) { get_file(f); } else { /* * The fd may be claimed in the fd bitmap but not yet * instantiated in the files array if a sibling thread * is partway through open(). So make sure that this * fd is available to the new process. */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* clear the remainder */ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *)); rcu_assign_pointer(newf->fdt, new_fdt); return newf; } static struct fdtable *close_files(struct files_struct * files) { /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the * files structure. 
*/ struct fdtable *fdt = rcu_dereference_raw(files->fdt); unsigned int i, j = 0; for (;;) { unsigned long set; i = j * BITS_PER_LONG; if (i >= fdt->max_fds) break; set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file *file = fdt->fd[i]; if (file) { filp_close(file, files); cond_resched(); } } i++; set >>= 1; } } return fdt; } void put_files_struct(struct files_struct *files) { if (atomic_dec_and_test(&files->count)) { struct fdtable *fdt = close_files(files); /* free the arrays if they are not embedded */ if (fdt != &files->fdtab) __free_fdtable(fdt); kmem_cache_free(files_cachep, files); } } void exit_files(struct task_struct *tsk) { struct files_struct * files = tsk->files; if (files) { task_lock(tsk); tsk->files = NULL; task_unlock(tsk); put_files_struct(files); } } struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], .close_on_exec = init_files.close_on_exec_init, .open_fds = init_files.open_fds_init, .full_fds_bits = init_files.full_fds_bits_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), }; static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) { unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */ unsigned int maxbit = maxfd / BITS_PER_LONG; unsigned int bitbit = start / BITS_PER_LONG; unsigned int bit; /* * Try to avoid looking at the second level bitmap */ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG, start & (BITS_PER_LONG - 1)); if (bit < BITS_PER_LONG) return bit + bitbit * BITS_PER_LONG; bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG; if (bitbit >= maxfd) return maxfd; if (bitbit > start) start = bitbit; return find_next_zero_bit(fdt->open_fds, maxfd, start); } /* * allocate a file descriptor, mark it busy. */ static int alloc_fd(unsigned start, unsigned end, unsigned flags) { struct files_struct *files = current->files; unsigned int fd; int error; struct fdtable *fdt; spin_lock(&files->file_lock); repeat: fdt = files_fdtable(files); fd = start; if (fd < files->next_fd) fd = files->next_fd; if (likely(fd < fdt->max_fds)) fd = find_next_fd(fdt, fd); /* * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ error = -EMFILE; if (unlikely(fd >= end)) goto out; if (unlikely(fd >= fdt->max_fds)) { error = expand_files(files, fd); if (error < 0) goto out; goto repeat; } if (start <= files->next_fd) files->next_fd = fd + 1; __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; out: spin_unlock(&files->file_lock); return error; } int __get_unused_fd_flags(unsigned flags, unsigned long nofile) { return alloc_fd(0, nofile, flags); } int get_unused_fd_flags(unsigned flags) { return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE)); } EXPORT_SYMBOL(get_unused_fd_flags); static void __put_unused_fd(struct files_struct *files, unsigned int fd) { struct fdtable *fdt = files_fdtable(files); __clear_open_fd(fd, fdt); if (fd < files->next_fd) files->next_fd = fd; } void put_unused_fd(unsigned int fd) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); } EXPORT_SYMBOL(put_unused_fd); /* * Install a file pointer in the fd array. 
* * The VFS is full of places where we drop the files lock between * setting the open_fds bitmap and installing the file in the file * array. At any such point, we are vulnerable to a dup2() race * installing a file in the array before us. We need to detect this and * fput() the struct file we are about to overwrite in this case. * * It should never happen - if we allow dup2() do it, _really_ bad things * will follow. * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; struct fdtable *fdt; if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) return; rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); WARN_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } EXPORT_SYMBOL(fd_install); /** * file_close_fd_locked - return file associated with fd * @files: file struct to retrieve file from * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Context: files_lock must be held. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) { struct fdtable *fdt = files_fdtable(files); struct file *file; lockdep_assert_held(&files->file_lock); if (fd >= fdt->max_fds) return NULL; fd = array_index_nospec(fd, fdt->max_fds); file = fdt->fd[fd]; if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); } return file; } int close_fd(unsigned fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); if (!file) return -EBADF; return filp_close(file, files); } EXPORT_SYMBOL(close_fd); /** * last_fd - return last valid index into fd table * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * * Returns: Last valid index into fdtable. */ static inline unsigned last_fd(struct fdtable *fdt) { return fdt->max_fds - 1; } static inline void __range_cloexec(struct files_struct *cur_fds, unsigned int fd, unsigned int max_fd) { struct fdtable *fdt; /* make sure we're using the correct maximum value */ spin_lock(&cur_fds->file_lock); fdt = files_fdtable(cur_fds); max_fd = min(last_fd(fdt), max_fd); if (fd <= max_fd) bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1); spin_unlock(&cur_fds->file_lock); } static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { struct file *file; unsigned n; spin_lock(&files->file_lock); n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); for (; fd <= max_fd; fd++) { file = file_close_fd_locked(files, fd); if (file) { spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } else if (need_resched()) { spin_unlock(&files->file_lock); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } /** * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close * @flags: CLOSE_RANGE flags. 
* * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. * Currently, errors to close a given file descriptor are ignored. */ SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd, unsigned int, flags) { struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC)) return -EINVAL; if (fd > max_fd) return -EINVAL; if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) { struct fd_range range = {fd, max_fd}, *punch_hole = ⦥ /* * If the caller requested all fds to be made cloexec we always * copy all of the file descriptors since they still want to * use them. */ if (flags & CLOSE_RANGE_CLOEXEC) punch_hole = NULL; fds = dup_fd(cur_fds, punch_hole); if (IS_ERR(fds)) return PTR_ERR(fds); /* * We used to share our file descriptor table, and have now * created a private one, make sure we're using it below. */ swap(cur_fds, fds); } if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else __range_close(cur_fds, fd, max_fd); if (fds) { /* * We're done closing the files we were supposed to. Time to install * the new file descriptor table and drop the old one. */ task_lock(me); me->files = cur_fds; task_unlock(me); put_files_struct(fds); } return 0; } /** * file_close_fd - return file associated with fd * @fd: file descriptor to retrieve file for * * Doesn't take a separate reference count. * * Returns: The file associated with @fd (NULL if @fd is not open) */ struct file *file_close_fd(unsigned int fd) { struct files_struct *files = current->files; struct file *file; spin_lock(&files->file_lock); file = file_close_fd_locked(files, fd); spin_unlock(&files->file_lock); return file; } void do_close_on_exec(struct files_struct *files) { unsigned i; struct fdtable *fdt; /* exec unshares first */ spin_lock(&files->file_lock); for (i = 0; ; i++) { unsigned long set; unsigned fd = i * BITS_PER_LONG; fdt = files_fdtable(files); if (fd >= fdt->max_fds) break; set = fdt->close_on_exec[i]; if (!set) continue; fdt->close_on_exec[i] = 0; for ( ; set ; fd++, set >>= 1) { struct file *file; if (!(set & 1)) continue; file = fdt->fd[fd]; if (!file) continue; rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); filp_close(file, files); cond_resched(); spin_lock(&files->file_lock); } } spin_unlock(&files->file_lock); } static struct file *__get_file_rcu(struct file __rcu **f) { struct file __rcu *file; struct file __rcu *file_reloaded; struct file __rcu *file_reloaded_cmp; file = rcu_dereference_raw(*f); if (!file) return NULL; if (unlikely(!file_ref_get(&file->f_ref))) return ERR_PTR(-EAGAIN); file_reloaded = rcu_dereference_raw(*f); /* * Ensure that all accesses have a dependency on the load from * rcu_dereference_raw() above so we get correct ordering * between reuse/allocation and the pointer check below. */ file_reloaded_cmp = file_reloaded; OPTIMIZER_HIDE_VAR(file_reloaded_cmp); /* * file_ref_get() above provided a full memory barrier when we * acquired a reference. * * This is paired with the write barrier from assigning to the * __rcu protected file pointer so that if that pointer still * matches the current file, we know we have successfully * acquired a reference to the right file. * * If the pointers don't match the file has been reallocated by * SLAB_TYPESAFE_BY_RCU. 
*/ if (file == file_reloaded_cmp) return file_reloaded; fput(file); return ERR_PTR(-EAGAIN); } /** * get_file_rcu - try go get a reference to a file under rcu * @f: the file to get a reference on * * This function tries to get a reference on @f carefully verifying that * @f hasn't been reused. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_rcu(struct file __rcu **f) { for (;;) { struct file __rcu *file; file = __get_file_rcu(f); if (!IS_ERR(file)) return file; } } EXPORT_SYMBOL_GPL(get_file_rcu); /** * get_file_active - try go get a reference to a file * @f: the file to get a reference on * * In contast to get_file_rcu() the pointer itself isn't part of the * reference counting. * * This function should rarely have to be used and only by users who * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. * * Return: Returns @f with the reference count increased or NULL. */ struct file *get_file_active(struct file **f) { struct file __rcu *file; rcu_read_lock(); file = __get_file_rcu(f); rcu_read_unlock(); if (IS_ERR(file)) file = NULL; return file; } EXPORT_SYMBOL_GPL(get_file_active); static inline struct file *__fget_files_rcu(struct files_struct *files, unsigned int fd, fmode_t mask) { for (;;) { struct file *file; struct fdtable *fdt = rcu_dereference_raw(files->fdt); struct file __rcu **fdentry; unsigned long nospec_mask; /* Mask is a 0 for invalid fd's, ~0 for valid ones */ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds); /* * fdentry points to the 'fd' offset, or fdt->fd[0]. * Loading from fdt->fd[0] is always safe, because the * array always exists. */ fdentry = fdt->fd + (fd & nospec_mask); /* Do the load, then mask any invalid result */ file = rcu_dereference_raw(*fdentry); file = (void *)(nospec_mask & (unsigned long)file); if (unlikely(!file)) return NULL; /* * Ok, we have a file pointer that was valid at * some point, but it might have become stale since. * * We need to confirm it by incrementing the refcount * and then check the lookup again. * * file_ref_get() gives us a full memory barrier. We * only really need an 'acquire' one to protect the * loads below, but we don't have that. */ if (unlikely(!file_ref_get(&file->f_ref))) continue; /* * Such a race can take two forms: * * (a) the file ref already went down to zero and the * file hasn't been reused yet or the file count * isn't zero but the file has already been reused. * * (b) the file table entry has changed under us. * Note that we don't need to re-check the 'fdt->fd' * pointer having changed, because it always goes * hand-in-hand with 'fdt'. * * If so, we need to put our ref and try again. */ if (unlikely(file != rcu_dereference_raw(*fdentry)) || unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* * This isn't the file we're looking for or we're not * allowed to get a reference to it. */ if (unlikely(file->f_mode & mask)) { fput(file); return NULL; } /* * Ok, we have a ref to the file, and checked that it * still exists. 
*/ return file; } } static struct file *__fget_files(struct files_struct *files, unsigned int fd, fmode_t mask) { struct file *file; rcu_read_lock(); file = __fget_files_rcu(files, fd, mask); rcu_read_unlock(); return file; } static inline struct file *__fget(unsigned int fd, fmode_t mask) { return __fget_files(current->files, fd, mask); } struct file *fget(unsigned int fd) { return __fget(fd, FMODE_PATH); } EXPORT_SYMBOL(fget); struct file *fget_raw(unsigned int fd) { return __fget(fd, 0); } EXPORT_SYMBOL(fget_raw); struct file *fget_task(struct task_struct *task, unsigned int fd) { struct file *file = NULL; task_lock(task); if (task->files) file = __fget_files(task->files, fd, 0); task_unlock(task); return file; } struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; unsigned int fd = *ret_fd; struct file *file = NULL; task_lock(task); files = task->files; if (files) { rcu_read_lock(); for (; fd < files_fdtable(files)->max_fds; fd++) { file = __fget_files_rcu(files, fd, 0); if (file) break; } rcu_read_unlock(); } task_unlock(task); *ret_fd = fd; return file; } EXPORT_SYMBOL(fget_task_next); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. * * You can use this instead of fget if you satisfy all of the following * conditions: * 1) You must call fput_light before exiting the syscall and returning control * to userspace (i.e. you cannot remember the returned struct file * after * returning to userspace). * 2) You must not call filp_close on the returned struct file * in between * calls to fget_light and fput_light. * 3) You must not clone the current task in between the calls to fget_light * and fput_light. * * The fput_needed flag returned by fget_light should be passed to the * corresponding fput_light. * * (As an exception to rule 2, you can call filp_close between fget_light and * fput_light provided that you capture a real refcount with get_file before * the call to filp_close, and ensure that this real refcount is fput *after* * the fput_light call.) * * See also the documentation in rust/kernel/file.rs. */ static inline struct fd __fget_light(unsigned int fd, fmode_t mask) { struct files_struct *files = current->files; struct file *file; /* * If another thread is concurrently calling close_fd() followed * by put_files_struct(), we must not observe the old table * entry combined with the new refcount - otherwise we could * return a file that is concurrently being freed. * * atomic_read_acquire() pairs with atomic_dec_and_test() in * put_files_struct(). */ if (likely(atomic_read_acquire(&files->count) == 1)) { file = files_lookup_fd_raw(files, fd); if (!file || unlikely(file->f_mode & mask)) return EMPTY_FD; return BORROWED_FD(file); } else { file = __fget_files(files, fd, mask); if (!file) return EMPTY_FD; return CLONED_FD(file); } } struct fd fdget(unsigned int fd) { return __fget_light(fd, FMODE_PATH); } EXPORT_SYMBOL(fdget); struct fd fdget_raw(unsigned int fd) { return __fget_light(fd, 0); } /* * Try to avoid f_pos locking. We only need it if the * file is marked for FMODE_ATOMIC_POS, and it can be * accessed multiple ways. * * Always do it for directories, because pidfd_getfd() * can make a file accessible even if it otherwise would * not be, and for directories this is a correctness * issue, not a "POSIX requirement". 
*/ static inline bool file_needs_f_pos_lock(struct file *file) { return (file->f_mode & FMODE_ATOMIC_POS) && (file_count(file) > 1 || file->f_op->iterate_shared); } struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); struct file *file = fd_file(f); if (file && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } return f; } void __f_unlock_pos(struct file *f) { mutex_unlock(&f->f_pos_lock); } /* * We only lock f_pos if we have threads or if the file might be * shared with another process. In both cases we'll have an elevated * file count (done either by fdget() or by fork()). */ void set_close_on_exec(unsigned int fd, int flag) { struct files_struct *files = current->files; spin_lock(&files->file_lock); __set_close_on_exec(fd, files_fdtable(files), flag); spin_unlock(&files->file_lock); } bool get_close_on_exec(unsigned int fd) { bool res; rcu_read_lock(); res = close_on_exec(fd, current->files); rcu_read_unlock(); return res; } static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) __releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; /* * We need to detect attempts to do dup2() over allocated but still * not finished descriptor. * * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); tofree = fdt->fd[fd]; if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); rcu_assign_pointer(fdt->fd[fd], file); __set_open_fd(fd, fdt, flags & O_CLOEXEC); spin_unlock(&files->file_lock); if (tofree) filp_close(tofree, files); return fd; Ebusy: spin_unlock(&files->file_lock); return -EBUSY; } int replace_fd(unsigned fd, struct file *file, unsigned flags) { int err; struct files_struct *files = current->files; if (!file) return close_fd(fd); if (fd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; return do_dup2(files, file, fd, flags); out_unlock: spin_unlock(&files->file_lock); return err; } /** * receive_fd() - Install received file into file descriptor table * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate * checks and count updates. Optionally writes the fd number to userspace, if * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * * Returns newly install fd or -ve on error. 
*/ int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; error = security_file_receive(file); if (error) return error; new_fd = get_unused_fd_flags(o_flags); if (new_fd < 0) return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { put_unused_fd(new_fd); return error; } } fd_install(new_fd, get_file(file)); __receive_sock(file); return new_fd; } EXPORT_SYMBOL_GPL(receive_fd); int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) { int error; error = security_file_receive(file); if (error) return error; error = replace_fd(new_fd, file, o_flags); if (error) return error; __receive_sock(file); return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; struct files_struct *files = current->files; if ((flags & ~O_CLOEXEC) != 0) return -EINVAL; if (unlikely(oldfd == newfd)) return -EINVAL; if (newfd >= rlimit(RLIMIT_NOFILE)) return -EBADF; spin_lock(&files->file_lock); err = expand_files(files, newfd); file = files_lookup_fd_locked(files, oldfd); if (unlikely(!file)) goto Ebadf; if (unlikely(err < 0)) { if (err == -EMFILE) goto Ebadf; goto out_unlock; } return do_dup2(files, file, newfd, flags); Ebadf: err = -EBADF; out_unlock: spin_unlock(&files->file_lock); return err; } SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) { return ksys_dup3(oldfd, newfd, flags); } SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; struct file *f; int retval = oldfd; rcu_read_lock(); f = __fget_files_rcu(files, oldfd, 0); if (!f) retval = -EBADF; rcu_read_unlock(); if (f) fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); } SYSCALL_DEFINE1(dup, unsigned int, fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); if (file) { ret = get_unused_fd_flags(0); if (ret >= 0) fd_install(ret, file); else fput(file); } return ret; } int f_dupfd(unsigned int from, struct file *file, unsigned flags) { unsigned long nofile = rlimit(RLIMIT_NOFILE); int err; if (from >= nofile) return -EINVAL; err = alloc_fd(from, nofile, flags); if (err >= 0) { get_file(file); fd_install(err, file); } return err; } int iterate_fd(struct files_struct *files, unsigned n, int (*f)(const void *, struct file *, unsigned), const void *p) { struct fdtable *fdt; int res = 0; if (!files) return 0; spin_lock(&files->file_lock); for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { struct file *file; file = rcu_dereference_check_fdtable(files, fdt->fd[n]); if (!file) continue; res = f(p, file, n); if (res) break; } spin_unlock(&files->file_lock); return res; } EXPORT_SYMBOL(iterate_fd); |
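The fd_install() comment above implies a fixed ordering when creating a new descriptor: reserve the fd number first, build the struct file, and only then publish it, because fd_install() consumes the file reference and cannot fail. Below is a minimal sketch of that pattern; it is illustrative only, and the anon-inode name, the O_CLOEXEC/O_RDWR flags and the helper name are assumptions rather than anything defined in fs/file.c.

/* Illustrative sketch, not part of fs/file.c. */
#include <linux/anon_inodes.h>
#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>

static int example_install_anon_fd(const struct file_operations *fops, void *priv)
{
	struct file *file;
	int fd;

	/* Reserve a descriptor number; nothing is visible to userspace yet. */
	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		return fd;

	/* Build the struct file that will back the descriptor. */
	file = anon_inode_getfile("[example]", fops, priv, O_RDWR);
	if (IS_ERR(file)) {
		put_unused_fd(fd);
		return PTR_ERR(file);
	}

	/* Publish it: fd_install() consumes the reference and cannot fail. */
	fd_install(fd, file);
	return fd;
}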
4 19 22 22 22 22 2 22 42 31 35 38 7 58 58 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | /* * Linear conversion Plug-In * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz>, * Abramo Bagnara <abramo@alsa-project.org> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/time.h> #include <sound/core.h> #include <sound/pcm.h> #include "pcm_plugin.h" /* * Basic linear conversion plugin */ struct linear_priv { int cvt_endian; /* need endian conversion? */ unsigned int src_ofs; /* byte offset in source format */ unsigned int dst_ofs; /* byte soffset in destination format */ unsigned int copy_ofs; /* byte offset in temporary u32 data */ unsigned int dst_bytes; /* byte size of destination format */ unsigned int copy_bytes; /* bytes to copy per conversion */ unsigned int flip; /* MSB flip for signeness, done after endian conv */ }; static inline void do_convert(struct linear_priv *data, unsigned char *dst, unsigned char *src) { unsigned int tmp = 0; unsigned char *p = (unsigned char *)&tmp; memcpy(p + data->copy_ofs, src + data->src_ofs, data->copy_bytes); if (data->cvt_endian) tmp = swab32(tmp); tmp ^= data->flip; memcpy(dst, p + data->dst_ofs, data->dst_bytes); } static void convert(struct snd_pcm_plugin *plugin, const struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { struct linear_priv *data = (struct linear_priv *)plugin->extra_data; int channel; int nchannels = plugin->src_format.channels; for (channel = 0; channel < nchannels; ++channel) { char *src; char *dst; int src_step, dst_step; snd_pcm_uframes_t frames1; if (!src_channels[channel].enabled) { if (dst_channels[channel].wanted) snd_pcm_area_silence(&dst_channels[channel].area, 0, frames, plugin->dst_format.format); dst_channels[channel].enabled = 0; continue; } dst_channels[channel].enabled = 1; src = src_channels[channel].area.addr + src_channels[channel].area.first / 8; dst = dst_channels[channel].area.addr + dst_channels[channel].area.first / 8; src_step = src_channels[channel].area.step / 8; dst_step = dst_channels[channel].area.step / 8; frames1 = frames; while (frames1-- > 0) { do_convert(data, dst, src); src += src_step; dst += dst_step; } } } static snd_pcm_sframes_t linear_transfer(struct snd_pcm_plugin *plugin, const 
struct snd_pcm_plugin_channel *src_channels, struct snd_pcm_plugin_channel *dst_channels, snd_pcm_uframes_t frames) { if (snd_BUG_ON(!plugin || !src_channels || !dst_channels)) return -ENXIO; if (frames == 0) return 0; #ifdef CONFIG_SND_DEBUG { unsigned int channel; for (channel = 0; channel < plugin->src_format.channels; channel++) { if (snd_BUG_ON(src_channels[channel].area.first % 8 || src_channels[channel].area.step % 8)) return -ENXIO; if (snd_BUG_ON(dst_channels[channel].area.first % 8 || dst_channels[channel].area.step % 8)) return -ENXIO; } } #endif if (frames > dst_channels[0].frames) frames = dst_channels[0].frames; convert(plugin, src_channels, dst_channels, frames); return frames; } static void init_data(struct linear_priv *data, snd_pcm_format_t src_format, snd_pcm_format_t dst_format) { int src_le, dst_le, src_bytes, dst_bytes; src_bytes = snd_pcm_format_width(src_format) / 8; dst_bytes = snd_pcm_format_width(dst_format) / 8; src_le = snd_pcm_format_little_endian(src_format) > 0; dst_le = snd_pcm_format_little_endian(dst_format) > 0; data->dst_bytes = dst_bytes; data->cvt_endian = src_le != dst_le; data->copy_bytes = src_bytes < dst_bytes ? src_bytes : dst_bytes; if (src_le) { data->copy_ofs = 4 - data->copy_bytes; data->src_ofs = src_bytes - data->copy_bytes; } else data->src_ofs = snd_pcm_format_physical_width(src_format) / 8 - src_bytes; if (dst_le) data->dst_ofs = 4 - data->dst_bytes; else data->dst_ofs = snd_pcm_format_physical_width(dst_format) / 8 - dst_bytes; if (snd_pcm_format_signed(src_format) != snd_pcm_format_signed(dst_format)) { if (dst_le) data->flip = (__force u32)cpu_to_le32(0x80000000); else data->flip = (__force u32)cpu_to_be32(0x80000000); } } int snd_pcm_plugin_build_linear(struct snd_pcm_substream *plug, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, struct snd_pcm_plugin **r_plugin) { int err; struct linear_priv *data; struct snd_pcm_plugin *plugin; if (snd_BUG_ON(!r_plugin)) return -ENXIO; *r_plugin = NULL; if (snd_BUG_ON(src_format->rate != dst_format->rate)) return -ENXIO; if (snd_BUG_ON(src_format->channels != dst_format->channels)) return -ENXIO; if (snd_BUG_ON(!snd_pcm_format_linear(src_format->format) || !snd_pcm_format_linear(dst_format->format))) return -ENXIO; err = snd_pcm_plugin_build(plug, "linear format conversion", src_format, dst_format, sizeof(struct linear_priv), &plugin); if (err < 0) return err; data = (struct linear_priv *)plugin->extra_data; init_data(data, src_format->format, dst_format->format); plugin->transfer = linear_transfer; *r_plugin = plugin; return 0; } |
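To make the byte shuffling in do_convert() above concrete, here is a small standalone userspace illustration of the same pack-into-u32 / optional-byteswap / flip-MSB scheme, specialised to converting S16_LE samples to U8. The host is assumed little-endian and the function name is mine; this is a sketch of the technique, not code from the ALSA plugin.

/* Standalone illustration (little-endian host assumed). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint8_t s16le_to_u8(int16_t sample)
{
	uint32_t tmp = 0;
	unsigned char *p = (unsigned char *)&tmp;
	unsigned char src[2];

	memcpy(src, &sample, 2);		/* S16_LE: low byte, high byte */
	/* copy_bytes = 1, copy_ofs = 3, src_ofs = 1: keep only the high byte */
	memcpy(p + 3, src + 1, 1);
	/* signedness differs between S16 and U8: flip the most significant bit */
	tmp ^= 0x80000000u;
	/* dst_ofs = 3, dst_bytes = 1: the converted byte now sits at p[3] */
	return p[3];
}

int main(void)
{
	/* prints "0 128 255": full-scale negative, silence, full-scale positive */
	printf("%d %d %d\n", s16le_to_u8(-32768), s16le_to_u8(0), s16le_to_u8(32767));
	return 0;
}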
18 1 3 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 | // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/module.h> #include <net/ip.h> #include <linux/ipv6.h> #include <linux/icmp.h> #include <net/ipv6.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("xt_tcp"); MODULE_ALIAS("xt_udp"); MODULE_ALIAS("ipt_udp"); MODULE_ALIAS("ipt_tcp"); MODULE_ALIAS("ip6t_udp"); MODULE_ALIAS("ip6t_tcp"); MODULE_ALIAS("ipt_icmp"); MODULE_ALIAS("ip6t_icmp6"); /* Returns 1 if the port is matched by the range, 0 otherwise */ static inline bool port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert) { return (port >= min && port <= max) ^ invert; } static bool tcp_find_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, unsigned int optlen, bool invert, bool *hotdrop) { /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ const u_int8_t *op; u_int8_t _opt[60 - sizeof(struct tcphdr)]; unsigned int i; pr_debug("finding option\n"); if (!optlen) return invert; /* If we don't have the whole header, drop packet. */ op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr), optlen, _opt); if (op == NULL) { *hotdrop = true; return false; } for (i = 0; i < optlen; ) { if (op[i] == option) return !invert; if (op[i] < 2) i++; else i += op[i+1]?:1; } return invert; } static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct tcphdr *th; struct tcphdr _tcph; const struct xt_tcp *tcpinfo = par->matchinfo; if (par->fragoff != 0) { /* To quote Alan: Don't allow a fragment of TCP 8 bytes in. Nobody normal causes this. Its a cracker trying to break in by doing a flag overwrite to pass the direction checks. */ if (par->fragoff == 1) { pr_debug("Dropping evil TCP offset=1 frag.\n"); par->hotdrop = true; } /* Must not be a fragment. */ return false; } th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. 
*/ pr_debug("Dropping evil TCP offset=0 tinygram.\n"); par->hotdrop = true; return false; } if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], ntohs(th->source), !!(tcpinfo->invflags & XT_TCP_INV_SRCPT))) return false; if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], ntohs(th->dest), !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) return false; if (!NF_INVF(tcpinfo, XT_TCP_INV_FLAGS, (((unsigned char *)th)[13] & tcpinfo->flg_mask) == tcpinfo->flg_cmp)) return false; if (tcpinfo->option) { if (th->doff * 4 < sizeof(_tcph)) { par->hotdrop = true; return false; } if (!tcp_find_option(tcpinfo->option, skb, par->thoff, th->doff*4 - sizeof(_tcph), tcpinfo->invflags & XT_TCP_INV_OPTION, &par->hotdrop)) return false; } return true; } static int tcp_mt_check(const struct xt_mtchk_param *par) { const struct xt_tcp *tcpinfo = par->matchinfo; /* Must specify no unknown invflags */ return (tcpinfo->invflags & ~XT_TCP_INV_MASK) ? -EINVAL : 0; } static bool udp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct udphdr *uh; struct udphdr _udph; const struct xt_udp *udpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph); if (uh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ pr_debug("Dropping evil UDP tinygram.\n"); par->hotdrop = true; return false; } return port_match(udpinfo->spts[0], udpinfo->spts[1], ntohs(uh->source), !!(udpinfo->invflags & XT_UDP_INV_SRCPT)) && port_match(udpinfo->dpts[0], udpinfo->dpts[1], ntohs(uh->dest), !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); } static int udp_mt_check(const struct xt_mtchk_param *par) { const struct xt_udp *udpinfo = par->matchinfo; /* Must specify no unknown invflags */ return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? -EINVAL : 0; } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ static bool type_code_in_range(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code) { return type == test_type && code >= min_code && code <= max_code; } static bool icmp_type_code_match(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code, bool invert) { return (test_type == 0xFF || type_code_in_range(test_type, min_code, max_code, type, code)) ^ invert; } static bool icmp6_type_code_match(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code, bool invert) { return type_code_in_range(test_type, min_code, max_code, type, code) ^ invert; } static bool icmp_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmphdr *ic; struct icmphdr _icmph; const struct ipt_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (!ic) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ par->hotdrop = true; return false; } return icmp_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], ic->type, ic->code, !!(icmpinfo->invflags & IPT_ICMP_INV)); } static bool icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmp6hdr *ic; struct icmp6hdr _icmph; const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (!ic) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. 
*/ par->hotdrop = true; return false; } return icmp6_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], ic->icmp6_type, ic->icmp6_code, !!(icmpinfo->invflags & IP6T_ICMP_INV)); } static int icmp_checkentry(const struct xt_mtchk_param *par) { const struct ipt_icmp *icmpinfo = par->matchinfo; return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0; } static int icmp6_checkentry(const struct xt_mtchk_param *par) { const struct ip6t_icmp *icmpinfo = par->matchinfo; return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; } static struct xt_match tcpudp_mt_reg[] __read_mostly = { { .name = "tcp", .family = NFPROTO_IPV4, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "tcp", .family = NFPROTO_IPV6, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "udp", .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udp", .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udplite", .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, { .name = "udplite", .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, { .name = "icmp", .match = icmp_match, .matchsize = sizeof(struct ipt_icmp), .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = NFPROTO_IPV4, .me = THIS_MODULE, }, { .name = "icmp6", .match = icmp6_match, .matchsize = sizeof(struct ip6t_icmp), .checkentry = icmp6_checkentry, .proto = IPPROTO_ICMPV6, .family = NFPROTO_IPV6, .me = THIS_MODULE, }, }; static int __init tcpudp_mt_init(void) { return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } static void __exit tcpudp_mt_exit(void) { xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } module_init(tcpudp_mt_init); module_exit(tcpudp_mt_exit); |
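The port_match() helper above folds iptables' range negation into a single XOR: a port matches when it lies inside [min, max], and the invert flag (the "!" form of --sport/--dport) simply flips that result. A tiny standalone illustration, with made-up port numbers:

/* Standalone illustration of the XOR-with-invert idiom. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool port_match(uint16_t min, uint16_t max, uint16_t port, bool invert)
{
	return (port >= min && port <= max) ^ invert;
}

int main(void)
{
	/* range 1000..2000, plain and inverted */
	printf("%d %d\n", port_match(1000, 2000, 1500, false),	/* 1: inside  */
			  port_match(1000, 2000, 1500, true));	/* 0: negated */
	printf("%d %d\n", port_match(1000, 2000, 80, false),	/* 0: outside */
			  port_match(1000, 2000, 80, true));	/* 1: negated */
	return 0;
}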
36545 3880 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H #include <asm/processor.h> #if defined(__KERNEL__) && !defined(__ASSEMBLY__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> enum cpuid_leafs { CPUID_1_EDX = 0, CPUID_8000_0001_EDX, CPUID_8086_0001_EDX, CPUID_LNX_1, CPUID_1_ECX, CPUID_C000_0001_EDX, CPUID_8000_0001_ECX, CPUID_LNX_2, CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, CPUID_LNX_5, NR_CPUID_WORDS, }; #define X86_CAP_FMT_NUM "%d:%d" #define x86_cap_flag_num(flag) ((flag) >> 5), ((flag) & 31) extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; #define X86_CAP_FMT "%s" #define x86_cap_flag(flag) x86_cap_flags[flag] /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) /* * There are 32 bits/features in each mask word. The high bits * (selected with (bit>>5) give us the word number and the low 5 * bits give us the bit/feature number inside the word. * (1UL<<((bit)&31) gives us a mask for the feature_bit so we can * see if it is set in the mask word. */ #define CHECK_BIT_IN_MASK_WORD(maskname, word, bit) \ (((bit)>>5)==(word) && (1UL<<((bit)&31) & maskname##word )) /* * {REQUIRED,DISABLED}_MASK_CHECK below may seem duplicated with the * following BUILD_BUG_ON_ZERO() check but when NCAPINTS gets changed, all * header macros which use NCAPINTS need to be changed. The duplicated macro * use causes the compiler to issue errors for all headers so that all usage * sites can be corrected. 
*/ #define REQUIRED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ REQUIRED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 1, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 2, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 3, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 4, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 5, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 6, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 7, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 8, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 9, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 10, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 11, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 12, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 13, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 14, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 15, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 16, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 17, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ DISABLED_MASK_CHECK || \ BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This is the default CPU features testing macro to use in code. * * It is for detection of features which need kernel infrastructure to be * used. It may *not* directly test the CPU itself. Use the cpu_has() family * if you want true runtime testing of CPU features, like in hypervisor code * where you are supporting a possible guest feature where host support for it * is not relevant. 
*/ #define cpu_feature_enabled(bit) \ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #define setup_force_cpu_cap(bit) do { \ \ if (!boot_cpu_has(bit)) \ WARN_ON(alternatives_patched); \ \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) /* * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling * the mainline (post-initialization) code. */ static __always_inline bool _static_cpu_has(u16 bit) { asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; t_no: return false; } #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has #define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ #endif /* _ASM_X86_CPUFEATURE_H */ |
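As a usage sketch of the macros defined above (not part of this header; X86_FEATURE_XMM2 is just an arbitrary example bit): cpu_feature_enabled() is the default test for kernel code paths, while boot_cpu_has() is a plain runtime check against boot_cpu_data.

/* Usage sketch only. */
#include <asm/cpufeature.h>
#include <linux/printk.h>

static void example_feature_tests(void)
{
	/*
	 * Default test for kernel code: folds to 0 at compile time if the
	 * feature sits in DISABLED_MASK, otherwise uses the patched
	 * static_cpu_has() fast path.
	 */
	if (cpu_feature_enabled(X86_FEATURE_XMM2))
		pr_info("SSE2 usable by the kernel\n");

	/*
	 * Plain runtime test against boot_cpu_data, e.g. for reporting or
	 * for guest-visible features where kernel infrastructure support
	 * is not what is being asked.
	 */
	if (boot_cpu_has(X86_FEATURE_XMM2))
		pr_info("SSE2 present in CPUID\n");
}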
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _CRYPTO_XTS_H
#define _CRYPTO_XTS_H

#include <crypto/b128ops.h>
#include <crypto/internal/skcipher.h>
#include <linux/fips.h>

#define XTS_BLOCK_SIZE 16

static inline int xts_verify_key(struct crypto_skcipher *tfm,
				 const u8 *key, unsigned int keylen)
{
	/*
	 * key consists of keys of equal size concatenated, therefore
	 * the length must be even.
	 */
	if (keylen % 2)
		return -EINVAL;

	/*
	 * In FIPS mode only a combined key length of either 256 or
	 * 512 bits is allowed, c.f. FIPS 140-3 IG C.I.
	 */
	if (fips_enabled && keylen != 32 && keylen != 64)
		return -EINVAL;

	/*
	 * Ensure that the AES and tweak key are not identical when
	 * in FIPS mode or the FORBID_WEAK_KEYS flag is set.
	 */
	if ((fips_enabled || (crypto_skcipher_get_flags(tfm) &
			      CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)) &&
	    !crypto_memneq(key, key + (keylen / 2), keylen / 2))
		return -EINVAL;

	return 0;
}

#endif /* _CRYPTO_XTS_H */
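A short sketch of how a driver's XTS setkey path typically uses xts_verify_key() before splitting the combined key in half; the function and the key-programming steps are hypothetical placeholders, not taken from this header.

/* Hypothetical driver setkey, for illustration only. */
static int example_xts_setkey(struct crypto_skcipher *tfm,
			      const u8 *key, unsigned int keylen)
{
	int err;

	/* Rejects odd lengths, FIPS-invalid sizes and identical key halves. */
	err = xts_verify_key(tfm, key, keylen);
	if (err)
		return err;

	/* First half is the data key, second half is the tweak key. */
	/* program_data_key(key, keylen / 2);               <- placeholder */
	/* program_tweak_key(key + keylen / 2, keylen / 2); <- placeholder */
	return 0;
}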
// SPDX-License-Identifier: GPL-2.0-only
/* This is a module which is used to mark packets for tracing. */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netfilter/x_tables.h>
#include <net/netfilter/nf_log.h>

MODULE_DESCRIPTION("Xtables: packet flow tracing");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_TRACE");
MODULE_ALIAS("ip6t_TRACE");

static int trace_tg_check(const struct xt_tgchk_param *par)
{
	return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG);
}

static void trace_tg_destroy(const struct xt_tgdtor_param *par)
{
	nf_logger_put(par->family, NF_LOG_TYPE_LOG);
}

static unsigned int
trace_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
	skb->nf_trace = 1;
	return XT_CONTINUE;
}

static struct xt_target trace_tg_reg[] __read_mostly = {
	{
		.name		= "TRACE",
		.revision	= 0,
		.family		= NFPROTO_IPV4,
		.table		= "raw",
		.target		= trace_tg,
		.checkentry	= trace_tg_check,
		.destroy	= trace_tg_destroy,
		.me		= THIS_MODULE,
	},
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
	{
		.name		= "TRACE",
		.revision	= 0,
		.family		= NFPROTO_IPV6,
		.table		= "raw",
		.target		= trace_tg,
		.checkentry	= trace_tg_check,
		.destroy	= trace_tg_destroy,
		.me		= THIS_MODULE,
	},
#endif
};

static int __init trace_tg_init(void)
{
	return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
}

static void __exit trace_tg_exit(void)
{
	xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg));
}

module_init(trace_tg_init);
module_exit(trace_tg_exit);
MODULE_SOFTDEP("pre: nf_log_syslog");
12 43 3 39 24 21 1220 1 1215 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | // SPDX-License-Identifier: GPL-2.0 #ifndef IOU_RSRC_H #define IOU_RSRC_H #include <linux/lockdep.h> enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, }; struct io_rsrc_node { unsigned char type; int refs; u64 tag; union { unsigned long file_ptr; struct io_mapped_ubuf *buf; }; }; struct io_mapped_ubuf { u64 ubuf; unsigned int len; unsigned int nr_bvecs; unsigned int folio_shift; refcount_t refs; unsigned long acct_pages; struct bio_vec bvec[] __counted_by(nr_bvecs); }; struct io_imu_folio_data { /* Head folio can be partially included in the fixed buf */ unsigned int nr_pages_head; /* For non-head/tail folios, has to be fully included */ unsigned int nr_pages_mid; unsigned int folio_shift; unsigned int nr_folios; }; struct io_rsrc_node *io_rsrc_node_alloc(int type); void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags); int io_sqe_files_unregister(struct io_ring_ctx *ctx); int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags); int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, int index) { if (index < data->nr) return data->nodes[array_index_nospec(index, data->nr)]; return NULL; } static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { lockdep_assert_held(&ctx->uring_lock); if (node && !--node->refs) io_free_rsrc_node(ctx, node); } static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_data *data, int index) { struct io_rsrc_node *node = data->nodes[index]; if (!node) return false; io_put_rsrc_node(ctx, node); data->nodes[index] = NULL; return true; } static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) { if (req->file_node) { io_put_rsrc_node(req->ctx, req->file_node); req->file_node = NULL; } if (req->flags & REQ_F_BUF_NODE) { io_put_rsrc_node(req->ctx, req->buf_node); req->buf_node = NULL; } } static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node, struct io_rsrc_node *node) { node->refs++; *dst_node = node; } static inline void io_req_assign_buf_node(struct io_kiocb *req, struct io_rsrc_node *node) { io_req_assign_rsrc_node(&req->buf_node, node); req->flags |= 
					  REQ_F_BUF_NODE;
}

int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);

int __io_account_mem(struct user_struct *user, unsigned long nr_pages);

static inline void __io_unaccount_mem(struct user_struct *user,
				      unsigned long nr_pages)
{
	atomic_long_sub(nr_pages, &user->locked_vm);
}

#endif
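A sketch of the lookup/assign pattern the inline helpers above exist for. The caller, the buf_table field name on the ring context and the -EFAULT return are assumptions for illustration; ctx->uring_lock must be held, as io_put_rsrc_node() asserts.

/* Hypothetical caller, for illustration only; holds ctx->uring_lock. */
static int example_bind_fixed_buffer(struct io_ring_ctx *ctx,
				     struct io_kiocb *req, unsigned int index)
{
	struct io_rsrc_node *node;

	node = io_rsrc_node_lookup(&ctx->buf_table, index);
	if (!node)
		return -EFAULT;

	/*
	 * Bumps node->refs and sets REQ_F_BUF_NODE; the reference is dropped
	 * again by io_req_put_rsrc_nodes() when the request is cleaned up.
	 */
	io_req_assign_buf_node(req, node);
	return 0;
}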
754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/swap.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. */ #include <linux/mm.h> #include <linux/sched.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/pagevec.h> #include <linux/init.h> #include <linux/export.h> #include <linux/mm_inline.h> #include <linux/percpu_counter.h> #include <linux/memremap.h> #include <linux/percpu.h> #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> #include <linux/page_idle.h> #include <linux/local_lock.h> #include <linux/buffer_head.h> #include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> /* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; const int page_cluster_max = 31; struct cpu_fbatches { /* * The following folio batches are grouped together because they are protected * by disabling preemption (and interrupts remain enabled). 
*/ local_lock_t lock; struct folio_batch lru_add; struct folio_batch lru_deactivate_file; struct folio_batch lru_deactivate; struct folio_batch lru_lazyfree; #ifdef CONFIG_SMP struct folio_batch lru_activate; #endif /* Protecting the following batches which require disabling interrupts */ local_lock_t lock_irq; struct folio_batch lru_move_tail; }; static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = { .lock = INIT_LOCAL_LOCK(lock), .lock_irq = INIT_LOCAL_LOCK(lock_irq), }; static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp, unsigned long *flagsp) { if (folio_test_lru(folio)) { folio_lruvec_relock_irqsave(folio, lruvecp, flagsp); lruvec_del_folio(*lruvecp, folio); __folio_clear_lru_flags(folio); } } /* * This path almost never happens for VM activity - pages are normally freed * in batches. But it gets used by networking - and for compound pages. */ static void page_cache_release(struct folio *folio) { struct lruvec *lruvec = NULL; unsigned long flags; __page_cache_release(folio, &lruvec, &flags); if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); } void __folio_put(struct folio *folio) { if (unlikely(folio_is_zone_device(folio))) { free_zone_device_folio(folio); return; } if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add(struct lruvec *lruvec, struct folio *folio) { int was_unevictable = folio_test_clear_unevictable(folio); long nr_pages = folio_nr_pages(folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* * Is an smp_mb__after_atomic() still required here, before * folio_evictable() tests the mlocked flag, to rule out the possibility * of stranding an evictable folio on an unevictable LRU? I think * not, because __munlock_folio() only clears the mlocked flag * while the LRU lock is held. * * (That is not true of __page_cache_release(), and not necessarily * true of folios_put(): but those only clear the mlocked flag after * folio_put_testzero() has excluded any other users of the folio.) */ if (folio_evictable(folio)) { if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { folio_clear_active(folio); folio_set_unevictable(folio); /* * folio->mlock_count = !!folio_test_mlocked(folio)? * But that leaves __mlock_folio() in doubt whether another * actor has already counted the mlock or not. Err on the * safe side, underestimate, let page reclaim fix it, rather * than leaving a page on the unevictable LRU indefinitely. 
*/ folio->mlock_count = 0; if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } lruvec_add_folio(lruvec, folio); trace_mm_lru_insertion(folio); } static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn) { int i; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; folio_lruvec_relock_irqsave(folio, &lruvec, &flags); move_fn(lruvec, folio); folio_set_lru(folio); } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); folios_put(fbatch); } static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch, struct folio *folio, move_fn_t move_fn, bool on_lru, bool disable_irq) { unsigned long flags; if (on_lru && !folio_test_clear_lru(folio)) return; folio_get(folio); if (disable_irq) local_lock_irqsave(&cpu_fbatches.lock_irq, flags); else local_lock(&cpu_fbatches.lock); if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) || lru_cache_disabled()) folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn); if (disable_irq) local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); else local_unlock(&cpu_fbatches.lock); } #define folio_batch_add_and_move(folio, op, on_lru) \ __folio_batch_add_and_move( \ &cpu_fbatches.op, \ folio, \ op, \ on_lru, \ offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \ ) static void lru_move_tail(struct lruvec *lruvec, struct folio *folio) { if (folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, folio_nr_pages(folio)); } /* * Writeback is about to end against a folio which has been marked for * immediate reclaim. If it still appears to be reclaimable, move it * to the tail of the inactive list. * * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races. */ void folio_rotate_reclaimable(struct folio *folio) { if (folio_test_locked(folio) || folio_test_dirty(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_move_tail, true); } void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_io, unsigned int nr_rotated) { unsigned long cost; /* * Reflect the relative cost of incurring IO and spending CPU * time on rotations. This doesn't attempt to make a precise * comparison, it just says: if reloads are about comparable * between the LRU lists, or rotations are overwhelmingly * different between them, adjust scan balance for CPU work. */ cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; do { unsigned long lrusize; /* * Hold lruvec->lru_lock is safe here, since * 1) The pinned lruvec in reclaim, or * 2) From a pre-LRU page during refault (which also holds the * rcu lock, so would be safe even if the page was on the LRU * and could move simultaneously to a new lruvec). */ spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += cost; else lruvec->anon_cost += cost; /* * Decay previous events * * Because workloads change over time (and to avoid * overflow) we keep these statistics as a floating * average, which ends up weighing recent refaults * more than old ones. 
*/ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) + lruvec_page_state(lruvec, NR_ACTIVE_ANON) + lruvec_page_state(lruvec, NR_INACTIVE_FILE) + lruvec_page_state(lruvec, NR_ACTIVE_FILE); if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) { lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), folio_nr_pages(folio), 0); } static void lru_activate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_active(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_set_active(folio); lruvec_add_folio(lruvec, folio); trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP static void folio_activate_drain(int cpu) { struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu); if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_activate); } void folio_activate(struct folio *folio) { if (folio_test_active(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_activate, true); } #else static inline void folio_activate_drain(int cpu) { } void folio_activate(struct folio *folio) { struct lruvec *lruvec; if (!folio_test_clear_lru(folio)) return; lruvec = folio_lruvec_lock_irq(folio); lru_activate(lruvec, folio); unlock_page_lruvec_irq(lruvec); folio_set_lru(folio); } #endif static void __lru_cache_activate_folio(struct folio *folio) { struct folio_batch *fbatch; int i; local_lock(&cpu_fbatches.lock); fbatch = this_cpu_ptr(&cpu_fbatches.lru_add); /* * Search backwards on the optimistic assumption that the folio being * activated has just been added to this batch. Note that only * the local batch is examined as a !LRU folio could be in the * process of being released, reclaimed, migrated or on a remote * batch that is currently being drained. Furthermore, marking * a remote batch's folio active potentially hits a race where * a folio is marked active just after it is added to the inactive * list causing accounting errors and BUG_ON checks to trigger. 
*/ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) { struct folio *batch_folio = fbatch->folios[i]; if (batch_folio == folio) { folio_set_active(folio); break; } } local_unlock(&cpu_fbatches.lock); } #ifdef CONFIG_LRU_GEN static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); if (folio_test_unevictable(folio)) return; /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return; } do { if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { if (!folio_test_workingset(folio)) folio_set_workingset(folio); return; } new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } static bool lru_gen_clear_refs(struct folio *folio) { struct lru_gen_folio *lrugen; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); if (gen < 0) return true; set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); } #else /* !CONFIG_LRU_GEN */ static void lru_gen_inc_refs(struct folio *folio) { } static bool lru_gen_clear_refs(struct folio *folio) { return false; } #endif /* CONFIG_LRU_GEN */ /** * folio_mark_accessed - Mark a folio as having seen activity. * @folio: The folio to mark. * * This function will perform one of the following transitions: * * * inactive,unreferenced -> inactive,referenced * * inactive,referenced -> active,unreferenced * * active,unreferenced -> active,referenced * * When a newly allocated folio is not yet visible, so safe for non-atomic ops, * __folio_set_referenced() may be substituted for folio_mark_accessed(). */ void folio_mark_accessed(struct folio *folio) { if (folio_test_dropbehind(folio)) return; if (lru_gen_enabled()) { lru_gen_inc_refs(folio); return; } if (!folio_test_referenced(folio)) { folio_set_referenced(folio); } else if (folio_test_unevictable(folio)) { /* * Unevictable pages are on the "LRU_UNEVICTABLE" list. But, * this list is never rotated or maintained, so marking an * unevictable page accessed has no effect. */ } else if (!folio_test_active(folio)) { /* * If the folio is on the LRU, queue it for activation via * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a * folio_batch, mark it active and it'll be moved to the active * LRU on the next drain. */ if (folio_test_lru(folio)) folio_activate(folio); else __lru_cache_activate_folio(folio); folio_clear_referenced(folio); workingset_activation(folio); } if (folio_test_idle(folio)) folio_clear_idle(folio); } EXPORT_SYMBOL(folio_mark_accessed); /** * folio_add_lru - Add a folio to an LRU list. * @folio: The folio to be added to the LRU. * * Queue the folio for addition to the LRU. The decision on whether * to add the page to the [in]active [file|anon] list is deferred until the * folio_batch is drained. This gives a chance for the caller of folio_add_lru() * have the folio added to the active list using folio_mark_accessed(). 
*/ void folio_add_lru(struct folio *folio) { VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); folio_batch_add_and_move(folio, lru_add, false); } EXPORT_SYMBOL(folio_add_lru); /** * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA. * @folio: The folio to be added to the LRU. * @vma: VMA in which the folio is mapped. * * If the VMA is mlocked, @folio is added to the unevictable list. * Otherwise, it is treated the same way as folio_add_lru(). */ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) mlock_new_folio(folio); else folio_add_lru(folio); } /* * If the folio cannot be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the * head of the list, rather than the tail, to give the flusher * threads some time to write it out, as this is much more * effective than the single-page writeout from reclaim. * * If the folio isn't mapped and dirty/writeback, the folio * could be reclaimed asap using the reclaim flag. * * 1. active, mapped folio -> none * 2. active, dirty/writeback folio -> inactive, head, reclaim * 3. inactive, mapped folio -> none * 4. inactive, dirty/writeback folio -> inactive, head, reclaim * 5. inactive, clean -> inactive, tail * 6. Others -> none * * In 4, it moves to the head of the inactive list so the folio is * written out by flusher threads as this is much more efficient * than the single-page writeout from reclaim. */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) return; /* Some processes are using the folio */ if (folio_mapped(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); if (folio_test_writeback(folio) || folio_test_dirty(folio)) { /* * Setting the reclaim flag could race with * folio_end_writeback() and confuse readahead. But the * race window is _really_ small and it's not a critical * problem. */ lruvec_add_folio(lruvec, folio); folio_set_reclaim(folio); } else { /* * The folio's writeback ended while it was in the batch. * We move that folio to the tail of the inactive list.
*/ lruvec_add_folio_tail(lruvec, folio); __count_vm_events(PGROTATED, nr_pages); } if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); folio_clear_referenced(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) { long nr_pages = folio_nr_pages(folio); if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; lruvec_del_folio(lruvec, folio); folio_clear_active(folio); if (lru_gen_enabled()) lru_gen_clear_refs(folio); else folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal * anonymous folios */ folio_clear_swapbacked(folio); lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* * Drain pages out of the cpu's folio_batch. * Either "cpu" is the current CPU, and preemption has already been * disabled; or "cpu" is being hot-unplugged, and is already dead. */ void lru_add_drain_cpu(int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); struct folio_batch *fbatch = &fbatches->lru_add; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_add); fbatch = &fbatches->lru_move_tail; /* Disabling interrupts below acts as a compiler barrier. */ if (data_race(folio_batch_count(fbatch))) { unsigned long flags; /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&cpu_fbatches.lock_irq, flags); folio_batch_move_lru(fbatch, lru_move_tail); local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags); } fbatch = &fbatches->lru_deactivate_file; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate_file); fbatch = &fbatches->lru_deactivate; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_deactivate); fbatch = &fbatches->lru_lazyfree; if (folio_batch_count(fbatch)) folio_batch_move_lru(fbatch, lru_lazyfree); folio_activate_drain(cpu); } /** * deactivate_file_folio() - Deactivate a file folio. * @folio: Folio to deactivate. * * This function hints to the VM that @folio is a good reclaim candidate, * for example if its invalidation fails due to the folio being dirty * or under writeback. * * Context: Caller holds a reference on the folio. */ void deactivate_file_folio(struct folio *folio) { /* Deactivating an unevictable folio will not accelerate reclaim */ if (folio_test_unevictable(folio)) return; if (lru_gen_enabled() && lru_gen_clear_refs(folio)) return; folio_batch_add_and_move(folio, lru_deactivate_file, true); } /* * folio_deactivate - deactivate a folio * @folio: folio to deactivate * * folio_deactivate() moves @folio to the inactive list if @folio was on the * active list and was not unevictable. This is done to accelerate the * reclaim of @folio. */ void folio_deactivate(struct folio *folio) { if (folio_test_unevictable(folio)) return; if (lru_gen_enabled() ? 
lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate, true); } /** * folio_mark_lazyfree - make an anon folio lazyfree * @folio: folio to deactivate * * folio_mark_lazyfree() moves @folio to the inactive file list. * This is done to accelerate the reclaim of @folio. */ void folio_mark_lazyfree(struct folio *folio) { if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) || folio_test_swapcache(folio) || folio_test_unevictable(folio)) return; folio_batch_add_and_move(folio, lru_lazyfree, true); } void lru_add_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } /* * It's called from per-cpu workqueue context in SMP case so * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on * the same cpu. It shouldn't be a problem in !SMP case since * the core is only one and the locks will disable preemption. */ static void lru_add_and_bh_lrus_drain(void) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); local_unlock(&cpu_fbatches.lock); invalidate_bh_lrus_cpu(); mlock_drain_local(); } void lru_add_drain_cpu_zone(struct zone *zone) { local_lock(&cpu_fbatches.lock); lru_add_drain_cpu(smp_processor_id()); drain_local_pages(zone); local_unlock(&cpu_fbatches.lock); mlock_drain_local(); } #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_and_bh_lrus_drain(); } static bool cpu_needs_drain(unsigned int cpu) { struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu); /* Check these in order of likelihood that they're not zero */ return folio_batch_count(&fbatches->lru_add) || folio_batch_count(&fbatches->lru_move_tail) || folio_batch_count(&fbatches->lru_deactivate_file) || folio_batch_count(&fbatches->lru_deactivate) || folio_batch_count(&fbatches->lru_lazyfree) || folio_batch_count(&fbatches->lru_activate) || need_mlock_drain(cpu) || has_bh_in_lru(cpu, NULL); } /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is * executed on the offlined cpu. * Calling this function with cpu hotplug locks held can actually lead * to obscure indirect dependencies via WQ context. */ static inline void __lru_add_drain_all(bool force_all_cpus) { /* * lru_drain_gen - Global pages generation number * * (A) Definition: global lru_drain_gen = x implies that all generations * 0 < n <= x are already *scheduled* for draining. * * This is an optimization for the highly-contended use case where a * user space workload keeps constantly generating a flow of pages for * each CPU. */ static unsigned int lru_drain_gen; static struct cpumask has_work; static DEFINE_MUTEX(lock); unsigned cpu, this_gen; /* * Make sure nobody triggers this path before mm_percpu_wq is fully * initialized. */ if (WARN_ON(!mm_percpu_wq)) return; /* * Guarantee folio_batch counter stores visible by this CPU * are visible to other CPUs before loading the current drain * generation. */ smp_mb(); /* * (B) Locally cache global LRU draining generation number * * The read barrier ensures that the counter is loaded before the mutex * is taken. It pairs with smp_mb() inside the mutex critical section * at (D). */ this_gen = smp_load_acquire(&lru_drain_gen); mutex_lock(&lock); /* * (C) Exit the draining operation if a newer generation, from another * lru_add_drain_all(), was already scheduled for draining. 
Check (A). */ if (unlikely(this_gen != lru_drain_gen && !force_all_cpus)) goto done; /* * (D) Increment global generation number * * Pairs with smp_load_acquire() at (B), outside of the critical * section. Use a full memory barrier to guarantee that the * new global drain generation number is stored before loading * folio_batch counters. * * This pairing must be done here, before the for_each_online_cpu loop * below which drains the page vectors. * * Let x, y, and z represent some system CPU numbers, where x < y < z. * Assume CPU #z is in the middle of the for_each_online_cpu loop * below and has already reached CPU #y's per-cpu data. CPU #x comes * along, adds some pages to its per-cpu vectors, then calls * lru_add_drain_all(). * * If the paired barrier is done at any later step, e.g. after the * loop, CPU #x will just exit at (C) and miss flushing out all of its * added pages. */ WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1); smp_mb(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { struct work_struct *work = &per_cpu(lru_add_drain_work, cpu); if (cpu_needs_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); __cpumask_set_cpu(cpu, &has_work); } } for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); done: mutex_unlock(&lock); } void lru_add_drain_all(void) { __lru_add_drain_all(false); } #else void lru_add_drain_all(void) { lru_add_drain(); } #endif /* CONFIG_SMP */ atomic_t lru_disable_count = ATOMIC_INIT(0); /* * lru_cache_disable() needs to be called before we start compiling * a list of folios to be migrated using folio_isolate_lru(). * It drains folios on LRU cache and then disable on all cpus until * lru_cache_enable is called. * * Must be paired with a call to lru_cache_enable(). */ void lru_cache_disable(void) { atomic_inc(&lru_disable_count); /* * Readers of lru_disable_count are protected by either disabling * preemption or rcu_read_lock: * * preempt_disable, local_irq_disable [bh_lru_lock()] * rcu_read_lock [rt_spin_lock CONFIG_PREEMPT_RT] * preempt_disable [local_lock !CONFIG_PREEMPT_RT] * * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on * preempt_disable() regions of code. So any CPU which sees * lru_disable_count = 0 will have exited the critical * section when synchronize_rcu() returns. */ synchronize_rcu_expedited(); #ifdef CONFIG_SMP __lru_add_drain_all(true); #else lru_add_and_bh_lrus_drain(); #endif } /** * folios_put_refs - Reduce the reference count on a batch of folios. * @folios: The folios. * @refs: The number of refs to subtract from each folio. * * Like folio_put(), but for a batch of folios. This is more efficient * than writing the loop yourself as it will optimise the locks which need * to be taken if the folios are freed. The folios batch is returned * empty and ready to be reused for another batch; there is no need * to reinitialise it. If @refs is NULL, we subtract one from each * folio refcount. * * Context: May be called in process or interrupt context, but not in NMI * context. May be called while holding a spinlock. */ void folios_put_refs(struct folio_batch *folios, unsigned int *refs) { int i, j; struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned int nr_refs = refs ? 
refs[i] : 1; if (is_huge_zero_folio(folio)) continue; if (folio_is_zone_device(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } if (put_devmap_managed_folio_refs(folio, nr_refs)) continue; if (folio_ref_sub_and_test(folio, nr_refs)) free_zone_device_folio(folio); continue; } if (!folio_ref_sub_and_test(folio, nr_refs)) continue; /* hugetlb has its own memcg */ if (folio_test_hugetlb(folio)) { if (lruvec) { unlock_page_lruvec_irqrestore(lruvec, flags); lruvec = NULL; } free_huge_folio(folio); continue; } folio_unqueue_deferred_split(folio); __page_cache_release(folio, &lruvec, &flags); if (j != i) folios->folios[j] = folio; j++; } if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); if (!j) { folio_batch_reinit(folios); return; } folios->nr = j; mem_cgroup_uncharge_folios(folios); free_unref_folios(folios); } EXPORT_SYMBOL(folios_put_refs); /** * release_pages - batched put_page() * @arg: array of pages to release * @nr: number of pages * * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. * * Note that the argument can be an array of pages, encoded pages, * or folio pointers. We ignore any encoded bits, and turn any of * them into just a folio that gets free'd. */ void release_pages(release_pages_arg arg, int nr) { struct folio_batch fbatch; int refs[PAGEVEC_SIZE]; struct encoded_page **encoded = arg.encoded_pages; int i; folio_batch_init(&fbatch); for (i = 0; i < nr; i++) { /* Turn any of the argument types into a folio */ struct folio *folio = page_folio(encoded_page_ptr(encoded[i])); /* Is our next entry actually "nr_pages" -> "nr_refs" ? */ refs[fbatch.nr] = 1; if (unlikely(encoded_page_flags(encoded[i]) & ENCODED_PAGE_BIT_NR_PAGES_NEXT)) refs[fbatch.nr] = encoded_nr_pages(encoded[++i]); if (folio_batch_add(&fbatch, folio) > 0) continue; folios_put_refs(&fbatch, refs); } if (fbatch.nr) folios_put_refs(&fbatch, refs); } EXPORT_SYMBOL(release_pages); /* * The folios which we're about to release may be in the deferred lru-addition * queues. That would prevent them from really being freed right now. That's * OK from a correctness point of view but is inefficient - those folios may be * cache-warm and we want to give them back to the page allocator ASAP. * * So __folio_batch_release() will drain those queues here. * folio_batch_move_lru() calls folios_put() directly to avoid * mutual recursion. */ void __folio_batch_release(struct folio_batch *fbatch) { if (!fbatch->percpu_pvec_drained) { lru_add_drain(); fbatch->percpu_pvec_drained = true; } folios_put(fbatch); } EXPORT_SYMBOL(__folio_batch_release); /** * folio_batch_remove_exceptionals() - Prune non-folios from a batch. * @fbatch: The batch to prune * * find_get_entries() fills a batch with both folios and shadow/swap/DAX * entries. This function prunes all the non-folio entries from @fbatch * without leaving holes, so that it can be passed on to folio-only batch * operations. 
*/ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) { unsigned int i, j; for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) { struct folio *folio = fbatch->folios[i]; if (!xa_is_value(folio)) fbatch->folios[j++] = folio; } fbatch->nr = j; } /* * Perform any setup for the swap system */ void __init swap_setup(void) { unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ if (megs < 16) page_cluster = 2; else page_cluster = 3; /* * Right now other parts of the system means that we * _really_ don't want to cluster much more */ } |
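/*
 * Illustrative sketch, not part of mm/swap.c: one way a caller might use the
 * lru_cache_disable()/lru_cache_enable() pair described above while compiling
 * a list of folios for migration with folio_isolate_lru().  The function name
 * and the @pagelist parameter are hypothetical; only lru_cache_disable(),
 * lru_cache_enable(), folio_isolate_lru() and the folio/list helpers are
 * existing kernel APIs, and the caller is assumed to already hold a reference
 * on @folio.
 */
static bool example_isolate_folio_for_migration(struct folio *folio,
						struct list_head *pagelist)
{
	bool isolated;

	/*
	 * Drain the per-CPU folio batches and keep them disabled, so the
	 * folio cannot be sitting in a batch while we try to isolate it.
	 */
	lru_cache_disable();

	isolated = folio_isolate_lru(folio);
	if (isolated)
		list_add_tail(&folio->lru, pagelist);

	/* Re-enable the per-CPU batches; pairs with lru_cache_disable(). */
	lru_cache_enable();

	return isolated;
}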
// SPDX-License-Identifier: GPL-2.0-or-later /* * inet_diag.c Module for monitoring INET transport protocols sockets. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/time.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/ipv6.h> #include <net/inet_common.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/inet6_hashtables.h> #include <net/bpf_sk_storage.h> #include <net/netlink.h> #include <linux/inet.h> #include <linux/stddef.h> #include <linux/inet_diag.h> #include <linux/sock_diag.h> static const struct inet_diag_handler __rcu **inet_diag_table; struct inet_diag_entry { const __be32 *saddr; const __be32 *daddr; u16 sport; u16 dport; u16 family; u16 userlocks; u32 ifindex; u32 mark; #ifdef CONFIG_SOCK_CGROUP_DATA u64 cgroup_id; #endif }; static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { const struct inet_diag_handler *handler; if (proto < 0 || proto >= IPPROTO_MAX) return NULL; if (!READ_ONCE(inet_diag_table[proto])) sock_load_diag_module(AF_INET, proto); rcu_read_lock(); handler = rcu_dereference(inet_diag_table[proto]); if (handler && !try_module_get(handler->owner)) handler = NULL; rcu_read_unlock(); return handler; } static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) { module_put(handler->owner); } void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) { r->idiag_family = sk->sk_family; r->id.idiag_sport = htons(sk->sk_num); r->id.idiag_dport = sk->sk_dport; r->id.idiag_if = sk->sk_bound_dev_if; sock_diag_save_cookie(sk, r->id.idiag_cookie); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; } else #endif { memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); r->id.idiag_src[0] = sk->sk_rcv_saddr; r->id.idiag_dst[0] = sk->sk_daddr; } } EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); static size_t inet_sk_attr_size(struct sock *sk, const struct inet_diag_req_v2 *req, bool net_admin) { const struct inet_diag_handler *handler; size_t aux = 0; rcu_read_lock(); handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]); DEBUG_NET_WARN_ON_ONCE(!handler); if (handler && handler->idiag_get_aux_size) aux = handler->idiag_get_aux_size(sk, net_admin); rcu_read_unlock(); return nla_total_size(sizeof(struct tcp_info)) + nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + nla_total_size(TCP_CA_NAME_MAX) + nla_total_size(sizeof(struct tcpvegas_info)) + aux + 64; } int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, bool net_admin) { const struct inet_sock *inet = inet_sk(sk); struct inet_diag_sockopt inet_sockopt; if
(nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) goto errout; /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, * hence this needs to be included regardless of socket family. */ if (ext & (1 << (INET_DIAG_TOS - 1))) if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0) goto errout; #if IS_ENABLED(CONFIG_IPV6) if (r->idiag_family == AF_INET6) { if (ext & (1 << (INET_DIAG_TCLASS - 1))) if (nla_put_u8(skb, INET_DIAG_TCLASS, inet6_sk(sk)->tclass) < 0) goto errout; if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk))) goto errout; } #endif if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, READ_ONCE(sk->sk_mark))) goto errout; if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) || ext & (1 << (INET_DIAG_TCLASS - 1))) { u32 classid = 0; #ifdef CONFIG_SOCK_CGROUP_DATA classid = sock_cgroup_classid(&sk->sk_cgrp_data); #endif /* Fallback to socket priority if class id isn't set. * Classful qdiscs use it as direct reference to class. * For cgroup2 classid is always zero. */ if (!classid) classid = READ_ONCE(sk->sk_priority); if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) goto errout; } #ifdef CONFIG_SOCK_CGROUP_DATA if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID, cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)), INET_DIAG_PAD)) goto errout; #endif r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); memset(&inet_sockopt, 0, sizeof(inet_sockopt)); inet_sockopt.recverr = inet_test_bit(RECVERR, sk); inet_sockopt.is_icsk = inet_test_bit(IS_ICSK, sk); inet_sockopt.freebind = inet_test_bit(FREEBIND, sk); inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk); inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk); inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk); inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk); inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk); inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk); inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk); inet_sockopt.defer_connect = inet_test_bit(DEFER_CONNECT, sk); if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt), &inet_sockopt)) goto errout; return 0; errout: return 1; } EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); static int inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen, struct nlattr **req_nlas) { struct nlattr *nla; int remaining; nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) { int type = nla_type(nla); if (type == INET_DIAG_REQ_PROTOCOL && nla_len(nla) != sizeof(u32)) return -EINVAL; if (type < __INET_DIAG_REQ_MAX) req_nlas[type] = nla; } return 0; } static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req, const struct inet_diag_dump_data *data) { if (data->req_nlas[INET_DIAG_REQ_PROTOCOL]) return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]); return req->sdiag_protocol; } #define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, u16 nlmsg_flags, bool net_admin) { const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; struct inet_diag_dump_data *cb_data; int ext = req->idiag_ext; struct inet_diag_msg *r; struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; u8 icsk_pending; int protocol; cb_data = cb->data; protocol = inet_diag_get_protocol(req, cb_data); /* inet_diag_lock_handler() made sure inet_diag_table[] is 
stable. */ handler = rcu_dereference_protected(inet_diag_table[protocol], 1); DEBUG_NET_WARN_ON_ONCE(!handler); if (!handler) return -ENXIO; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); BUG_ON(!sk_fullsock(sk)); inet_diag_msg_common_fill(r, sk); r->idiag_state = sk->sk_state; r->idiag_timer = 0; r->idiag_retrans = 0; r->idiag_expires = 0; if (inet_diag_msg_attrs_fill(sk, skb, r, ext, sk_user_ns(NETLINK_CB(cb->skb).sk), net_admin)) goto errout; if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), .idiag_fmem = sk_forward_alloc_get(sk), .idiag_tmem = sk_wmem_alloc_get(sk), }; if (nla_put(skb, INET_DIAG_MEMINFO, sizeof(minfo), &minfo) < 0) goto errout; } if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO)) goto errout; /* * RAW sockets might have user-defined protocols assigned, * so report the one supplied on socket creation. */ if (sk->sk_type == SOCK_RAW) { if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol)) goto errout; } if (!icsk) { handler->idiag_get_info(sk, r, NULL); goto out; } icsk_pending = smp_load_acquire(&icsk->icsk_pending); if (icsk_pending == ICSK_TIME_RETRANS || icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { attr = nla_reserve_64bit(skb, INET_DIAG_INFO, handler->idiag_info_size, INET_DIAG_PAD); if (!attr) goto errout; info = nla_data(attr); } if (ext & (1 << (INET_DIAG_CONG - 1))) { int err = 0; rcu_read_lock(); ca_ops = READ_ONCE(icsk->icsk_ca_ops); if (ca_ops) err = nla_put_string(skb, INET_DIAG_CONG, ca_ops->name); rcu_read_unlock(); if (err < 0) goto errout; } handler->idiag_get_info(sk, r, info); if (ext & (1 << (INET_DIAG_INFO - 1)) && handler->idiag_get_aux) if (handler->idiag_get_aux(sk, net_admin, skb) < 0) goto errout; if (sk->sk_state < TCP_TIME_WAIT) { union tcp_cc_info info; size_t sz = 0; int attr; rcu_read_lock(); ca_ops = READ_ONCE(icsk->icsk_ca_ops); if (ca_ops && ca_ops->get_info) sz = ca_ops->get_info(sk, ext, &attr, &info); rcu_read_unlock(); if (sz && nla_put(skb, attr, sz, &info) < 0) goto errout; } /* Keep it at the end for potential retry with a larger skb, * or else do best-effort fitting, which is only done for the * first_nlmsg. 
*/ if (cb_data->bpf_stg_diag) { bool first_nlmsg = ((unsigned char *)nlh == skb->data); unsigned int prev_min_dump_alloc; unsigned int total_nla_size = 0; unsigned int msg_len; int err; msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb, INET_DIAG_SK_BPF_STORAGES, &total_nla_size); if (!err) goto out; total_nla_size += msg_len; prev_min_dump_alloc = cb->min_dump_alloc; if (total_nla_size > prev_min_dump_alloc) cb->min_dump_alloc = min_t(u32, total_nla_size, MAX_DUMP_ALLOC_SIZE); if (!first_nlmsg) goto errout; if (cb->min_dump_alloc > prev_min_dump_alloc) /* Retry with pskb_expand_head() with * __GFP_DIRECT_RECLAIM */ goto errout; WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc); /* Send what we have for this sk * and move on to the next sk in the following * dump() */ } out: nlmsg_end(skb, nlh); return 0; errout: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); static int inet_twsk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, u16 nlmsg_flags, bool net_admin) { struct inet_timewait_sock *tw = inet_twsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); BUG_ON(tw->tw_state != TCP_TIME_WAIT); inet_diag_msg_common_fill(r, sk); r->idiag_retrans = 0; r->idiag_state = READ_ONCE(tw->tw_substate); r->idiag_timer = 3; tmo = tw->tw_timer.expires - jiffies; r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; r->idiag_inode = 0; if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, tw->tw_mark)) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } nlmsg_end(skb, nlh); return 0; } static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, u16 nlmsg_flags, bool net_admin) { struct request_sock *reqsk = inet_reqsk(sk); struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); inet_diag_msg_common_fill(r, sk); r->idiag_state = TCP_SYN_RECV; r->idiag_timer = 1; r->idiag_retrans = reqsk->num_retrans; BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != offsetof(struct sock, sk_cookie)); tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; r->idiag_expires = jiffies_delta_to_msecs(tmo); r->idiag_rqueue = 0; r->idiag_wqueue = 0; r->idiag_uid = 0; r->idiag_inode = 0; if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, inet_rsk(reqsk)->ir_mark)) { nlmsg_cancel(skb, nlh); return -EMSGSIZE; } nlmsg_end(skb, nlh); return 0; } static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, u16 nlmsg_flags, bool net_admin) { if (sk->sk_state == TCP_TIME_WAIT) return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); if (sk->sk_state == TCP_NEW_SYN_RECV) return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, net_admin); } struct sock *inet_diag_find_one_icsk(struct net *net, struct inet_hashinfo *hashinfo, const struct inet_diag_req_v2 *req) { struct sock *sk; rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], 
req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else sk = inet6_lookup(net, hashinfo, NULL, 0, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if); } #endif else { rcu_read_unlock(); return ERR_PTR(-EINVAL); } rcu_read_unlock(); if (!sk) return ERR_PTR(-ENOENT); if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { sock_gen_put(sk); return ERR_PTR(-ENOENT); } return sk; } EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); struct net *net = sock_net(in_skb->sk); struct sk_buff *rep; struct sock *sk; int err; sk = inet_diag_find_one_icsk(net, hashinfo, req); if (IS_ERR(sk)) return PTR_ERR(sk); rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL); if (!rep) { err = -ENOMEM; goto out; } err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); if (err < 0) { WARN_ON(err == -EMSGSIZE); nlmsg_free(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) sock_gen_put(sk); return err; } EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, int hdrlen, const struct inet_diag_req_v2 *req) { const struct inet_diag_handler *handler; struct inet_diag_dump_data dump_data; int err, protocol; memset(&dump_data, 0, sizeof(dump_data)); err = inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas); if (err) return err; protocol = inet_diag_get_protocol(req, &dump_data); handler = inet_diag_lock_handler(protocol); if (!handler) return -ENOENT; if (cmd == SOCK_DIAG_BY_FAMILY) { struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, .data = &dump_data, }; err = handler->dump_one(&cb, req); } else if (cmd == SOCK_DESTROY && handler->destroy) { err = handler->destroy(in_skb, req); } else { err = -EOPNOTSUPP; } inet_diag_unlock_handler(handler); return err; } static int bitstring_match(const __be32 *a1, const __be32 *a2, int bits) { int words = bits >> 5; bits &= 0x1f; if (words) { if (memcmp(a1, a2, words << 2)) return 0; } if (bits) { __be32 w1, w2; __be32 mask; w1 = a1[words]; w2 = a2[words]; mask = htonl((0xffffffff) << (32 - bits)); if ((w1 ^ w2) & mask) return 0; } return 1; } static int inet_diag_bc_run(const struct nlattr *_bc, const struct inet_diag_entry *entry) { const void *bc = nla_data(_bc); int len = nla_len(_bc); while (len > 0) { int yes = 1; const struct inet_diag_bc_op *op = bc; switch (op->code) { case INET_DIAG_BC_NOP: break; case INET_DIAG_BC_JMP: yes = 0; break; case INET_DIAG_BC_S_EQ: yes = entry->sport == op[1].no; break; case INET_DIAG_BC_S_GE: yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: yes = entry->sport <= op[1].no; break; case INET_DIAG_BC_D_EQ: yes = entry->dport == op[1].no; break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; break; case INET_DIAG_BC_D_LE: yes = entry->dport <= op[1].no; break; case INET_DIAG_BC_AUTO: yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); break; case INET_DIAG_BC_S_COND: case 
INET_DIAG_BC_D_COND: { const struct inet_diag_hostcond *cond; const __be32 *addr; cond = (const struct inet_diag_hostcond *)(op + 1); if (cond->port != -1 && cond->port != (op->code == INET_DIAG_BC_S_COND ? entry->sport : entry->dport)) { yes = 0; break; } if (op->code == INET_DIAG_BC_S_COND) addr = entry->saddr; else addr = entry->daddr; if (cond->family != AF_UNSPEC && cond->family != entry->family) { if (entry->family == AF_INET6 && cond->family == AF_INET) { if (addr[0] == 0 && addr[1] == 0 && addr[2] == htonl(0xffff) && bitstring_match(addr + 3, cond->addr, cond->prefix_len)) break; } yes = 0; break; } if (cond->prefix_len == 0) break; if (bitstring_match(addr, cond->addr, cond->prefix_len)) break; yes = 0; break; } case INET_DIAG_BC_DEV_COND: { u32 ifindex; ifindex = *((const u32 *)(op + 1)); if (ifindex != entry->ifindex) yes = 0; break; } case INET_DIAG_BC_MARK_COND: { struct inet_diag_markcond *cond; cond = (struct inet_diag_markcond *)(op + 1); if ((entry->mark & cond->mask) != cond->mark) yes = 0; break; } #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: { u64 cgroup_id; cgroup_id = get_unaligned((const u64 *)(op + 1)); if (cgroup_id != entry->cgroup_id) yes = 0; break; } #endif } if (yes) { len -= op->yes; bc += op->yes; } else { len -= op->no; bc += op->no; } } return len == 0; } /* This helper is available for all sockets (ESTABLISH, TIMEWAIT, SYN_RECV) */ static void entry_fill_addrs(struct inet_diag_entry *entry, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32; entry->daddr = sk->sk_v6_daddr.s6_addr32; } else #endif { entry->saddr = &sk->sk_rcv_saddr; entry->daddr = &sk->sk_daddr; } } int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct inet_diag_entry entry; if (!bc) return 1; entry.family = sk->sk_family; entry_fill_addrs(&entry, sk); entry.sport = inet->inet_num; entry.dport = ntohs(inet->inet_dport); entry.ifindex = sk->sk_bound_dev_if; entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; if (sk_fullsock(sk)) entry.mark = READ_ONCE(sk->sk_mark); else if (sk->sk_state == TCP_NEW_SYN_RECV) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else if (sk->sk_state == TCP_TIME_WAIT) entry.mark = inet_twsk(sk)->tw_mark; else entry.mark = 0; #ifdef CONFIG_SOCK_CGROUP_DATA entry.cgroup_id = sk_fullsock(sk) ? cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; #endif return inet_diag_bc_run(bc, &entry); } EXPORT_SYMBOL_GPL(inet_diag_bc_sk); static int valid_cc(const void *bc, int len, int cc) { while (len >= 0) { const struct inet_diag_bc_op *op = bc; if (cc > len) return 0; if (cc == len) return 1; if (op->yes < 4 || op->yes & 3) return 0; len -= op->yes; bc += op->yes; } return 0; } /* data is u32 ifindex */ static bool valid_devcond(const struct inet_diag_bc_op *op, int len, int *min_len) { /* Check ifindex space. */ *min_len += sizeof(u32); if (len < *min_len) return false; return true; } /* Validate an inet_diag_hostcond. */ static bool valid_hostcond(const struct inet_diag_bc_op *op, int len, int *min_len) { struct inet_diag_hostcond *cond; int addr_len; /* Check hostcond space. */ *min_len += sizeof(struct inet_diag_hostcond); if (len < *min_len) return false; cond = (struct inet_diag_hostcond *)(op + 1); /* Check address family and address length. 
*/ switch (cond->family) { case AF_UNSPEC: addr_len = 0; break; case AF_INET: addr_len = sizeof(struct in_addr); break; case AF_INET6: addr_len = sizeof(struct in6_addr); break; default: return false; } *min_len += addr_len; if (len < *min_len) return false; /* Check prefix length (in bits) vs address length (in bytes). */ if (cond->prefix_len > 8 * addr_len) return false; return true; } /* Validate a port comparison operator. */ static bool valid_port_comparison(const struct inet_diag_bc_op *op, int len, int *min_len) { /* Port comparisons put the port in a follow-on inet_diag_bc_op. */ *min_len += sizeof(struct inet_diag_bc_op); if (len < *min_len) return false; return true; } static bool valid_markcond(const struct inet_diag_bc_op *op, int len, int *min_len) { *min_len += sizeof(struct inet_diag_markcond); return len >= *min_len; } #ifdef CONFIG_SOCK_CGROUP_DATA static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, int *min_len) { *min_len += sizeof(u64); return len >= *min_len; } #endif static int inet_diag_bc_audit(const struct nlattr *attr, const struct sk_buff *skb) { bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); const void *bytecode, *bc; int bytecode_len, len; if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) return -EINVAL; bytecode = bc = nla_data(attr); len = bytecode_len = nla_len(attr); while (len > 0) { int min_len = sizeof(struct inet_diag_bc_op); const struct inet_diag_bc_op *op = bc; switch (op->code) { case INET_DIAG_BC_S_COND: case INET_DIAG_BC_D_COND: if (!valid_hostcond(bc, len, &min_len)) return -EINVAL; break; case INET_DIAG_BC_DEV_COND: if (!valid_devcond(bc, len, &min_len)) return -EINVAL; break; case INET_DIAG_BC_S_EQ: case INET_DIAG_BC_S_GE: case INET_DIAG_BC_S_LE: case INET_DIAG_BC_D_EQ: case INET_DIAG_BC_D_GE: case INET_DIAG_BC_D_LE: if (!valid_port_comparison(bc, len, &min_len)) return -EINVAL; break; case INET_DIAG_BC_MARK_COND: if (!net_admin) return -EPERM; if (!valid_markcond(bc, len, &min_len)) return -EINVAL; break; #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: if (!valid_cgroupcond(bc, len, &min_len)) return -EINVAL; break; #endif case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; default: return -EINVAL; } if (op->code != INET_DIAG_BC_NOP) { if (op->no < min_len || op->no > len + 4 || op->no & 3) return -EINVAL; if (op->no < len && !valid_cc(bytecode, bytecode_len, len - op->no)) return -EINVAL; } if (op->yes < min_len || op->yes > len + 4 || op->yes & 3) return -EINVAL; bc += op->yes; len -= op->yes; } return len == 0 ? 
0 : -EINVAL; } static void twsk_build_assert(void) { BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != offsetof(struct sock, sk_family)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != offsetof(struct inet_sock, inet_num)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != offsetof(struct inet_sock, inet_dport)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != offsetof(struct inet_sock, inet_rcv_saddr)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != offsetof(struct inet_sock, inet_daddr)); #if IS_ENABLED(CONFIG_IPV6) BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != offsetof(struct sock, sk_v6_rcv_saddr)); BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != offsetof(struct sock, sk_v6_daddr)); #endif } void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct inet_diag_dump_data *cb_data = cb->data; struct net *net = sock_net(skb->sk); u32 idiag_states = r->idiag_states; int i, num, s_i, s_num; struct nlattr *bc; struct sock *sk; bc = cb_data->inet_diag_nla_bc; if (idiag_states & TCPF_SYN_RECV) idiag_states |= TCPF_NEW_SYN_RECV; s_i = cb->args[1]; s_num = num = cb->args[2]; if (cb->args[0] == 0) { if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) goto skip_listen_ht; for (i = s_i; i <= hashinfo->lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; num = 0; ilb = &hashinfo->lhash2[i]; if (hlist_nulls_empty(&ilb->nulls_head)) { s_num = 0; continue; } spin_lock(&ilb->lock); sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) { num++; continue; } if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next_listen; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next_listen; if (!inet_diag_bc_sk(bc, sk)) goto next_listen; if (inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, NLM_F_MULTI, net_admin) < 0) { spin_unlock(&ilb->lock); goto done; } next_listen: ++num; } spin_unlock(&ilb->lock); s_num = 0; } skip_listen_ht: cb->args[0] = 1; s_i = num = s_num = 0; } /* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets * with bh disabled. */ #define SKARR_SZ 16 /* Dump bound but inactive (not listening, connecting, etc.) 
sockets */ if (cb->args[0] == 1) { if (!(idiag_states & TCPF_BOUND_INACTIVE)) goto skip_bind_ht; for (i = s_i; i < hashinfo->bhash_size; i++) { struct inet_bind_hashbucket *ibb; struct inet_bind2_bucket *tb2; struct sock *sk_arr[SKARR_SZ]; int num_arr[SKARR_SZ]; int idx, accum, res; resume_bind_walk: num = 0; accum = 0; ibb = &hashinfo->bhash2[i]; if (hlist_empty(&ibb->chain)) { s_num = 0; continue; } spin_lock_bh(&ibb->lock); inet_bind_bucket_for_each(tb2, &ibb->chain) { if (!net_eq(ib2_net(tb2), net)) continue; sk_for_each_bound(sk, &tb2->owners) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) goto next_bind; if (sk->sk_state != TCP_CLOSE || !inet->inet_num) goto next_bind; if (r->sdiag_family != AF_UNSPEC && r->sdiag_family != sk->sk_family) goto next_bind; if (!inet_diag_bc_sk(bc, sk)) goto next_bind; sock_hold(sk); num_arr[accum] = num; sk_arr[accum] = sk; if (++accum == SKARR_SZ) goto pause_bind_walk; next_bind: num++; } } pause_bind_walk: spin_unlock_bh(&ibb->lock); res = 0; for (idx = 0; idx < accum; idx++) { if (res >= 0) { res = inet_sk_diag_fill(sk_arr[idx], NULL, skb, cb, r, NLM_F_MULTI, net_admin); if (res < 0) num = num_arr[idx]; } sock_put(sk_arr[idx]); } if (res < 0) goto done; cond_resched(); if (accum == SKARR_SZ) { s_num = num + 1; goto resume_bind_walk; } s_num = 0; } skip_bind_ht: cb->args[0] = 2; s_i = num = s_num = 0; } if (!(idiag_states & ~TCPF_LISTEN)) goto out; for (i = s_i; i <= hashinfo->ehash_mask; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; spinlock_t *lock = inet_ehash_lockp(hashinfo, i); struct hlist_nulls_node *node; struct sock *sk_arr[SKARR_SZ]; int num_arr[SKARR_SZ]; int idx, accum, res; if (hlist_nulls_empty(&head->chain)) continue; if (i > s_i) s_num = 0; next_chunk: num = 0; accum = 0; spin_lock_bh(lock); sk_nulls_for_each(sk, node, &head->chain) { int state; if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next_normal; state = (sk->sk_state == TCP_TIME_WAIT) ? 
READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; if (!(idiag_states & (1 << state))) goto next_normal; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next_normal; if (r->id.idiag_sport != htons(sk->sk_num) && r->id.idiag_sport) goto next_normal; if (r->id.idiag_dport != sk->sk_dport && r->id.idiag_dport) goto next_normal; twsk_build_assert(); if (!inet_diag_bc_sk(bc, sk)) goto next_normal; if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto next_normal; num_arr[accum] = num; sk_arr[accum] = sk; if (++accum == SKARR_SZ) break; next_normal: ++num; } spin_unlock_bh(lock); res = 0; for (idx = 0; idx < accum; idx++) { if (res >= 0) { res = sk_diag_fill(sk_arr[idx], skb, cb, r, NLM_F_MULTI, net_admin); if (res < 0) num = num_arr[idx]; } sock_gen_put(sk_arr[idx]); } if (res < 0) break; cond_resched(); if (accum == SKARR_SZ) { s_num = num + 1; goto next_chunk; } } done: cb->args[1] = i; cb->args[2] = num; out: ; } EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { struct inet_diag_dump_data *cb_data = cb->data; const struct inet_diag_handler *handler; u32 prev_min_dump_alloc; int protocol, err = 0; protocol = inet_diag_get_protocol(r, cb_data); again: prev_min_dump_alloc = cb->min_dump_alloc; handler = inet_diag_lock_handler(protocol); if (handler) { handler->dump(skb, cb, r); inet_diag_unlock_handler(handler); } else { err = -ENOENT; } /* The skb is not large enough to fit one sk info and * inet_sk_diag_fill() has requested for a larger skb. */ if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) { err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL); if (!err) goto again; } return err ? : skb->len; } static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh)); } static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) { const struct nlmsghdr *nlh = cb->nlh; struct inet_diag_dump_data *cb_data; struct sk_buff *skb = cb->skb; struct nlattr *nla; int err; cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL); if (!cb_data) return -ENOMEM; err = inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas); if (err) { kfree(cb_data); return err; } nla = cb_data->inet_diag_nla_bc; if (nla) { err = inet_diag_bc_audit(nla, skb); if (err) { kfree(cb_data); return err; } } nla = cb_data->inet_diag_nla_bpf_stgs; if (nla) { struct bpf_sk_storage_diag *bpf_stg_diag; bpf_stg_diag = bpf_sk_storage_diag_alloc(nla); if (IS_ERR(bpf_stg_diag)) { kfree(cb_data); return PTR_ERR(bpf_stg_diag); } cb_data->bpf_stg_diag = bpf_stg_diag; } cb->data = cb_data; return 0; } static int inet_diag_dump_start(struct netlink_callback *cb) { return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2)); } static int inet_diag_dump_start_compat(struct netlink_callback *cb) { return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req)); } static int inet_diag_dump_done(struct netlink_callback *cb) { struct inet_diag_dump_data *cb_data = cb->data; bpf_sk_storage_diag_free(cb_data->bpf_stg_diag); kfree(cb->data); return 0; } static int inet_diag_type2proto(int type) { switch (type) { case TCPDIAG_GETSOCK: return IPPROTO_TCP; case DCCPDIAG_GETSOCK: return IPPROTO_DCCP; default: return 0; } } static int inet_diag_dump_compat(struct sk_buff *skb, struct netlink_callback *cb) { struct inet_diag_req *rc = nlmsg_data(cb->nlh); struct inet_diag_req_v2 req; req.sdiag_family = AF_UNSPEC; /* compatibility */ 
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; req.pad = 0; req.idiag_states = rc->idiag_states; req.id = rc->id; return __inet_diag_dump(skb, cb, &req); } static int inet_diag_get_exact_compat(struct sk_buff *in_skb, const struct nlmsghdr *nlh) { struct inet_diag_req *rc = nlmsg_data(nlh); struct inet_diag_req_v2 req; req.sdiag_family = rc->idiag_family; req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type); req.idiag_ext = rc->idiag_ext; req.pad = 0; req.idiag_states = rc->idiag_states; req.id = rc->id; return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, sizeof(struct inet_diag_req), &req); } static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh) { int hdrlen = sizeof(struct inet_diag_req); struct net *net = sock_net(skb->sk); if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX || nlmsg_len(nlh) < hdrlen) return -EINVAL; if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = inet_diag_dump_start_compat, .done = inet_diag_dump_done, .dump = inet_diag_dump_compat, }; return netlink_dump_start(net->diag_nlsk, skb, nlh, &c); } return inet_diag_get_exact_compat(skb, nlh); } static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct inet_diag_req_v2); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY && h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = inet_diag_dump_start, .done = inet_diag_dump_done, .dump = inet_diag_dump, }; return netlink_dump_start(net->diag_nlsk, skb, h, &c); } return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen, nlmsg_data(h)); } static int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) { const struct inet_diag_handler *handler; struct nlmsghdr *nlh; struct nlattr *attr; struct inet_diag_msg *r; void *info = NULL; int err = 0; nlh = nlmsg_put(skb, 0, 0, SOCK_DIAG_BY_FAMILY, sizeof(*r), 0); if (!nlh) return -ENOMEM; r = nlmsg_data(nlh); memset(r, 0, sizeof(*r)); inet_diag_msg_common_fill(r, sk); if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_STREAM) r->id.idiag_sport = inet_sk(sk)->inet_sport; r->idiag_state = sk->sk_state; if ((err = nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))) { nlmsg_cancel(skb, nlh); return err; } handler = inet_diag_lock_handler(sk->sk_protocol); if (!handler) { nlmsg_cancel(skb, nlh); return -ENOENT; } attr = handler->idiag_info_size ? nla_reserve_64bit(skb, INET_DIAG_INFO, handler->idiag_info_size, INET_DIAG_PAD) : NULL; if (attr) info = nla_data(attr); handler->idiag_get_info(sk, r, info); inet_diag_unlock_handler(handler); nlmsg_end(skb, nlh); return 0; } static const struct sock_diag_handler inet_diag_handler = { .owner = THIS_MODULE, .family = AF_INET, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, .destroy = inet_diag_handler_cmd, }; static const struct sock_diag_handler inet6_diag_handler = { .owner = THIS_MODULE, .family = AF_INET6, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, .destroy = inet_diag_handler_cmd, }; int inet_diag_register(const struct inet_diag_handler *h) { const __u16 type = h->idiag_type; if (type >= IPPROTO_MAX) return -EINVAL; return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type], NULL, h) ? 
0 : -EEXIST; } EXPORT_SYMBOL_GPL(inet_diag_register); void inet_diag_unregister(const struct inet_diag_handler *h) { const __u16 type = h->idiag_type; if (type >= IPPROTO_MAX) return; xchg((const struct inet_diag_handler **)&inet_diag_table[type], NULL); } EXPORT_SYMBOL_GPL(inet_diag_unregister); static const struct sock_diag_inet_compat inet_diag_compat = { .owner = THIS_MODULE, .fn = inet_diag_rcv_msg_compat, }; static int __init inet_diag_init(void) { const int inet_diag_table_size = (IPPROTO_MAX * sizeof(struct inet_diag_handler *)); int err = -ENOMEM; inet_diag_table = kzalloc(inet_diag_table_size, GFP_KERNEL); if (!inet_diag_table) goto out; err = sock_diag_register(&inet_diag_handler); if (err) goto out_free_nl; err = sock_diag_register(&inet6_diag_handler); if (err) goto out_free_inet; sock_diag_register_inet_compat(&inet_diag_compat); out: return err; out_free_inet: sock_diag_unregister(&inet_diag_handler); out_free_nl: kfree(inet_diag_table); goto out; } static void __exit inet_diag_exit(void) { sock_diag_unregister(&inet6_diag_handler); sock_diag_unregister(&inet_diag_handler); sock_diag_unregister_inet_compat(&inet_diag_compat); kfree(inet_diag_table); } module_init(inet_diag_init); module_exit(inet_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("INET/INET6: socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */); |
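/*
 * The dump machinery above is driven from userspace over a NETLINK_SOCK_DIAG
 * socket.  The sketch below is a minimal userspace illustration (it is not
 * part of inet_diag.c): it sends one SOCK_DIAG_BY_FAMILY request carrying an
 * inet_diag_req_v2 that asks for all IPv4/TCP sockets and prints the returned
 * inet_diag_msg records.  Error handling and multi-part buffering are kept
 * deliberately small.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/inet_diag.h>

int dump_tcp4_sockets(void)
{
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg = {
		.nlh = {
			.nlmsg_len = sizeof(msg),
			.nlmsg_type = SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family = AF_INET,
			.sdiag_protocol = IPPROTO_TCP,
			.idiag_states = ~0U,	/* all TCP states */
		},
	};
	char buf[8192];
	int fd, len;

	fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_SOCK_DIAG);
	if (fd < 0)
		return -1;
	if (send(fd, &msg, sizeof(msg), 0) < 0) {
		close(fd);
		return -1;
	}
	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			struct inet_diag_msg *r = NLMSG_DATA(h);

			if (h->nlmsg_type == NLMSG_DONE ||
			    h->nlmsg_type == NLMSG_ERROR) {
				close(fd);
				return 0;
			}
			/* one inet_diag_msg per matching socket */
			printf("src port %u state %u\n",
			       ntohs(r->id.idiag_sport), r->idiag_state);
		}
	}
	close(fd);
	return 0;
}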
14 14 14 14 14 14 14 14 85 85 85 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/net/sunrpc/stats.c * * procfs-based user access to generic RPC statistics. The stats files * reside in /proc/net/rpc. * * The read routines assume that the buffer passed in is just big enough. * If you implement an RPC service that has its own stats routine which * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE * limit. * * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de> */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svcsock.h> #include <linux/sunrpc/metrics.h> #include <linux/rcupdate.h> #include <trace/events/sunrpc.h> #include "netns.h" #define RPCDBG_FACILITY RPCDBG_MISC /* * Get RPC client stats */ static int rpc_proc_show(struct seq_file *seq, void *v) { const struct rpc_stat *statp = seq->private; const struct rpc_program *prog = statp->program; unsigned int i, j; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u\n", statp->rpccnt, statp->rpcretrans, statp->rpcauthrefresh); for (i = 0; i < prog->nrvers; i++) { const struct rpc_version *vers = prog->version[i]; if (!vers) continue; seq_printf(seq, "proc%u %u", vers->number, vers->nrprocs); for (j = 0; j < vers->nrprocs; j++) seq_printf(seq, " %u", vers->counts[j]); seq_putc(seq, '\n'); } return 0; } static int rpc_proc_open(struct inode *inode, struct file *file) { return single_open(file, rpc_proc_show, pde_data(inode)); } static const struct proc_ops rpc_proc_ops = { .proc_open = rpc_proc_open, .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, }; /* * Get RPC server stats */ void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { const struct svc_program *prog = statp->program; const struct svc_version *vers; unsigned int i, j, k; unsigned long count; seq_printf(seq, "net %u %u %u %u\n", statp->netcnt, statp->netudpcnt, statp->nettcpcnt, statp->nettcpconn); seq_printf(seq, "rpc %u %u %u %u %u\n", statp->rpccnt, 
statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt, statp->rpcbadfmt, statp->rpcbadauth, statp->rpcbadclnt); for (i = 0; i < prog->pg_nvers; i++) { vers = prog->pg_vers[i]; if (!vers) continue; seq_printf(seq, "proc%d %u", i, vers->vs_nproc); for (j = 0; j < vers->vs_nproc; j++) { count = 0; for_each_possible_cpu(k) count += per_cpu(vers->vs_count[j], k); seq_printf(seq, " %lu", count); } seq_putc(seq, '\n'); } } EXPORT_SYMBOL_GPL(svc_seq_show); /** * rpc_alloc_iostats - allocate an rpc_iostats structure * @clnt: RPC program, version, and xprt * */ struct rpc_iostats *rpc_alloc_iostats(struct rpc_clnt *clnt) { struct rpc_iostats *stats; int i; stats = kcalloc(clnt->cl_maxproc, sizeof(*stats), GFP_KERNEL); if (stats) { for (i = 0; i < clnt->cl_maxproc; i++) spin_lock_init(&stats[i].om_lock); } return stats; } EXPORT_SYMBOL_GPL(rpc_alloc_iostats); /** * rpc_free_iostats - release an rpc_iostats structure * @stats: doomed rpc_iostats structure * */ void rpc_free_iostats(struct rpc_iostats *stats) { kfree(stats); } EXPORT_SYMBOL_GPL(rpc_free_iostats); /** * rpc_count_iostats_metrics - tally up per-task stats * @task: completed rpc_task * @op_metrics: stat structure for OP that will accumulate stats from @task */ void rpc_count_iostats_metrics(const struct rpc_task *task, struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; ktime_t backlog, execute, now; if (!op_metrics || !req) return; now = ktime_get(); spin_lock(&op_metrics->om_lock); op_metrics->om_ops++; /* kernel API: om_ops must never become larger than om_ntrans */ op_metrics->om_ntrans += max(req->rq_ntrans, 1); op_metrics->om_timeouts += task->tk_timeouts; op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; backlog = 0; if (ktime_to_ns(req->rq_xtime)) { backlog = ktime_sub(req->rq_xtime, task->tk_start); op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog); } op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); execute = ktime_sub(now, task->tk_start); op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute); if (task->tk_status < 0) op_metrics->om_error_status++; spin_unlock(&op_metrics->om_lock); trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute); } EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); /** * rpc_count_iostats - tally up per-task stats * @task: completed rpc_task * @stats: array of stat structures * * Uses the statidx from @task */ void rpc_count_iostats(const struct rpc_task *task, struct rpc_iostats *stats) { rpc_count_iostats_metrics(task, &stats[task->tk_msg.rpc_proc->p_statidx]); } EXPORT_SYMBOL_GPL(rpc_count_iostats); static void _print_name(struct seq_file *seq, unsigned int op, const struct rpc_procinfo *procs) { if (procs[op].p_name) seq_printf(seq, "\t%12s: ", procs[op].p_name); else if (op == 0) seq_printf(seq, "\t NULL: "); else seq_printf(seq, "\t%12u: ", op); } static void _add_rpc_iostats(struct rpc_iostats *a, struct rpc_iostats *b) { a->om_ops += b->om_ops; a->om_ntrans += b->om_ntrans; a->om_timeouts += b->om_timeouts; a->om_bytes_sent += b->om_bytes_sent; a->om_bytes_recv += b->om_bytes_recv; a->om_queue = ktime_add(a->om_queue, b->om_queue); a->om_rtt = ktime_add(a->om_rtt, b->om_rtt); a->om_execute = ktime_add(a->om_execute, b->om_execute); a->om_error_status += b->om_error_status; } static void _print_rpc_iostats(struct seq_file *seq, struct rpc_iostats *stats, int op, const struct rpc_procinfo *procs) { _print_name(seq, op, procs); seq_printf(seq, "%lu %lu %lu %llu %llu 
%llu %llu %llu %lu\n", stats->om_ops, stats->om_ntrans, stats->om_timeouts, stats->om_bytes_sent, stats->om_bytes_recv, ktime_to_ms(stats->om_queue), ktime_to_ms(stats->om_rtt), ktime_to_ms(stats->om_execute), stats->om_error_status); } static int do_print_stats(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *seqv) { struct seq_file *seq = seqv; xprt->ops->print_stats(xprt, seq); return 0; } void rpc_clnt_show_stats(struct seq_file *seq, struct rpc_clnt *clnt) { unsigned int op, maxproc = clnt->cl_maxproc; if (!clnt->cl_metrics) return; seq_printf(seq, "\tRPC iostats version: %s ", RPC_IOSTATS_VERS); seq_printf(seq, "p/v: %u/%u (%s)\n", clnt->cl_prog, clnt->cl_vers, clnt->cl_program->name); rpc_clnt_iterate_for_each_xprt(clnt, do_print_stats, seq); seq_printf(seq, "\tper-op statistics\n"); for (op = 0; op < maxproc; op++) { struct rpc_iostats stats = {}; struct rpc_clnt *next = clnt; do { _add_rpc_iostats(&stats, &next->cl_metrics[op]); if (next == next->cl_parent) break; next = next->cl_parent; } while (next); _print_rpc_iostats(seq, &stats, op, clnt->cl_procinfo); } } EXPORT_SYMBOL_GPL(rpc_clnt_show_stats); /* * Register/unregister RPC proc files */ static inline struct proc_dir_entry * do_register(struct net *net, const char *name, void *data, const struct proc_ops *proc_ops) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc/%s\n", name); sn = net_generic(net, sunrpc_net_id); return proc_create_data(name, 0, sn->proc_net_rpc, proc_ops, data); } struct proc_dir_entry * rpc_proc_register(struct net *net, struct rpc_stat *statp) { return do_register(net, statp->program->name, statp, &rpc_proc_ops); } EXPORT_SYMBOL_GPL(rpc_proc_register); void rpc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { return do_register(net, statp->program->pg_name, net, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); void svc_proc_unregister(struct net *net, const char *name) { struct sunrpc_net *sn; sn = net_generic(net, sunrpc_net_id); remove_proc_entry(name, sn->proc_net_rpc); } EXPORT_SYMBOL_GPL(svc_proc_unregister); int rpc_proc_init(struct net *net) { struct sunrpc_net *sn; dprintk("RPC: registering /proc/net/rpc\n"); sn = net_generic(net, sunrpc_net_id); sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net); if (sn->proc_net_rpc == NULL) return -ENOMEM; return 0; } void rpc_proc_exit(struct net *net) { dprintk("RPC: unregistering /proc/net/rpc\n"); remove_proc_entry("rpc", net->proc_net); } |
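/*
 * rpc_proc_show() above writes one "net" line, one "rpc" line and one "procN"
 * line per program version to /proc/net/rpc/<program name>.  The userspace
 * sketch below (not part of stats.c) pulls the call and retransmit counters
 * out of the client "rpc" line; the "/proc/net/rpc/nfs" path in the usage
 * note is only an example of a registered program name.
 */
#include <stdio.h>
#include <string.h>

int read_rpc_counters(const char *path, unsigned int *calls,
		      unsigned int *retrans)
{
	unsigned int authrefresh;
	char tag[16];
	FILE *f = fopen(path, "r");
	int ret = -1;

	if (!f)
		return -1;
	/* format written by rpc_proc_show(): "rpc <calls> <retrans> <authrefresh>" */
	while (fscanf(f, "%15s", tag) == 1) {
		if (!strcmp(tag, "rpc") &&
		    fscanf(f, "%u %u %u", calls, retrans, &authrefresh) == 3) {
			ret = 0;
			break;
		}
	}
	fclose(f);
	return ret;
}

/* e.g. read_rpc_counters("/proc/net/rpc/nfs", &calls, &retrans); */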
7 7 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | // SPDX-License-Identifier: GPL-2.0-only /* * Netlink interface for IEEE 802.15.4 stack * * Copyright 2007, 2008 Siemens AG * * Written by: * Sergey Lapin <slapin@ossfans.org> * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com> * Maxim Osipov <maxim.osipov@siemens.com> */ #include <linux/kernel.h> #include <linux/gfp.h> #include <net/genetlink.h> #include <linux/nl802154.h> #include "ieee802154.h" static unsigned int ieee802154_seq_num; static DEFINE_SPINLOCK(ieee802154_seq_lock); /* Requests to userspace */ struct sk_buff *ieee802154_nl_create(int flags, u8 req) { void *hdr; struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); unsigned long f; if (!msg) return NULL; spin_lock_irqsave(&ieee802154_seq_lock, f); hdr = genlmsg_put(msg, 0, ieee802154_seq_num++, &nl802154_family, flags, req); spin_unlock_irqrestore(&ieee802154_seq_lock, f); if (!hdr) { nlmsg_free(msg); return NULL; } return msg; } int ieee802154_nl_mcast(struct sk_buff *msg, unsigned int group) { struct nlmsghdr *nlh = nlmsg_hdr(msg); void *hdr = genlmsg_data(nlmsg_data(nlh)); genlmsg_end(msg, hdr); return genlmsg_multicast(&nl802154_family, msg, 0, group, GFP_ATOMIC); } struct sk_buff *ieee802154_nl_new_reply(struct genl_info *info, int flags, u8 req) { void *hdr; struct sk_buff *msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return NULL; hdr = genlmsg_put_reply(msg, info, &nl802154_family, flags, req); if (!hdr) { nlmsg_free(msg); return NULL; } return msg; } int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info) { struct nlmsghdr *nlh = nlmsg_hdr(msg); void *hdr = genlmsg_data(nlmsg_data(nlh)); genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); } static const struct genl_small_ops ieee802154_ops[] = { /* see nl-phy.c */ IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy, ieee802154_dump_phy), IEEE802154_OP(IEEE802154_ADD_IFACE, ieee802154_add_iface), IEEE802154_OP(IEEE802154_DEL_IFACE, ieee802154_del_iface), /* see nl-mac.c */ IEEE802154_OP(IEEE802154_ASSOCIATE_REQ, ieee802154_associate_req), IEEE802154_OP(IEEE802154_ASSOCIATE_RESP, ieee802154_associate_resp), IEEE802154_OP(IEEE802154_DISASSOCIATE_REQ, ieee802154_disassociate_req), IEEE802154_OP(IEEE802154_SCAN_REQ, ieee802154_scan_req), IEEE802154_OP(IEEE802154_START_REQ, ieee802154_start_req), IEEE802154_DUMP(IEEE802154_LIST_IFACE, ieee802154_list_iface, ieee802154_dump_iface), IEEE802154_OP(IEEE802154_SET_MACPARAMS, ieee802154_set_macparams), IEEE802154_OP(IEEE802154_LLSEC_GETPARAMS, ieee802154_llsec_getparams), IEEE802154_OP(IEEE802154_LLSEC_SETPARAMS, ieee802154_llsec_setparams), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_KEY, NULL, ieee802154_llsec_dump_keys), IEEE802154_OP(IEEE802154_LLSEC_ADD_KEY, ieee802154_llsec_add_key), IEEE802154_OP(IEEE802154_LLSEC_DEL_KEY, ieee802154_llsec_del_key), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEV, NULL, ieee802154_llsec_dump_devs), IEEE802154_OP(IEEE802154_LLSEC_ADD_DEV, ieee802154_llsec_add_dev), IEEE802154_OP(IEEE802154_LLSEC_DEL_DEV, ieee802154_llsec_del_dev), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_DEVKEY, 
NULL, ieee802154_llsec_dump_devkeys), IEEE802154_OP(IEEE802154_LLSEC_ADD_DEVKEY, ieee802154_llsec_add_devkey), IEEE802154_OP(IEEE802154_LLSEC_DEL_DEVKEY, ieee802154_llsec_del_devkey), IEEE802154_DUMP(IEEE802154_LLSEC_LIST_SECLEVEL, NULL, ieee802154_llsec_dump_seclevels), IEEE802154_OP(IEEE802154_LLSEC_ADD_SECLEVEL, ieee802154_llsec_add_seclevel), IEEE802154_OP(IEEE802154_LLSEC_DEL_SECLEVEL, ieee802154_llsec_del_seclevel), }; static const struct genl_multicast_group ieee802154_mcgrps[] = { [IEEE802154_COORD_MCGRP] = { .name = IEEE802154_MCAST_COORD_NAME, }, [IEEE802154_BEACON_MCGRP] = { .name = IEEE802154_MCAST_BEACON_NAME, }, }; struct genl_family nl802154_family __ro_after_init = { .hdrsize = 0, .name = IEEE802154_NL_NAME, .version = 1, .maxattr = IEEE802154_ATTR_MAX, .policy = ieee802154_policy, .module = THIS_MODULE, .small_ops = ieee802154_ops, .n_small_ops = ARRAY_SIZE(ieee802154_ops), .resv_start_op = IEEE802154_LLSEC_DEL_SECLEVEL + 1, .mcgrps = ieee802154_mcgrps, .n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps), }; int __init ieee802154_nl_init(void) { return genl_register_family(&nl802154_family); } void ieee802154_nl_exit(void) { genl_unregister_family(&nl802154_family); } |
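/*
 * The helpers above are intended to be used in pairs: ieee802154_nl_create()
 * followed by ieee802154_nl_mcast() for unsolicited indications, and
 * ieee802154_nl_new_reply() followed by ieee802154_nl_reply() for command
 * replies.  The sketch below shows the indication pattern only; the function
 * itself is illustrative and the IEEE802154_ATTR_* names are taken from the
 * legacy nl802154 UAPI rather than defined in this file.
 */
static int ieee802154_nl_example_indication(struct net_device *dev, u8 cmd)
{
	struct sk_buff *msg;

	msg = ieee802154_nl_create(0, cmd);
	if (!msg)
		return -ENOBUFS;

	if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) ||
	    nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex)) {
		nlmsg_free(msg);
		return -EMSGSIZE;
	}

	/* deliver to everyone listening on the coordinator multicast group */
	return ieee802154_nl_mcast(msg, IEEE802154_COORD_MCGRP);
}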
// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 * Author: Muchun Song <songmuchun@bytedance.com>
 */
#ifndef _LINUX_HUGETLB_VMEMMAP_H
#define _LINUX_HUGETLB_VMEMMAP_H
#include <linux/hugetlb.h>

/*
 * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
 * Documentation/mm/vmemmap_dedup.rst.
 */
#define HUGETLB_VMEMMAP_RESERVE_SIZE	PAGE_SIZE
#define HUGETLB_VMEMMAP_RESERVE_PAGES	(HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page))

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio);
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
			struct list_head *folio_list,
			struct list_head *non_hvo_folios);
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);

static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
	return pages_per_huge_page(h) * sizeof(struct page);
}

/*
 * Return how much of the vmemmap associated with a HugeTLB page can be
 * optimized and freed back to the buddy allocator.
 */
static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
	int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE;

	if (!is_power_of_2(sizeof(struct page)))
		return 0;
	return size > 0 ? size : 0;
}
#else
static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
	return 0;
}

static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h,
			struct list_head *folio_list,
			struct list_head *non_hvo_folios)
{
	list_splice_init(folio_list, non_hvo_folios);
	return 0;
}

static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
}

static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
}

static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
	return 0;
}
#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */

static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
{
	return hugetlb_vmemmap_optimizable_size(h) != 0;
}
#endif /* _LINUX_HUGETLB_VMEMMAP_H */
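/*
 * The standalone userspace snippet below (an illustration, not part of this
 * header) works through hugetlb_vmemmap_optimizable_size() assuming a 4 KiB
 * PAGE_SIZE, a 64-byte struct page and a 2 MiB HugeTLB page (512 base pages);
 * none of these values are fixed by this header.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int page_size = 4096;		/* assumed PAGE_SIZE */
	const unsigned int struct_page_size = 64;	/* assumed sizeof(struct page) */
	const unsigned int pages_per_huge_page = 512;	/* 2 MiB / 4 KiB */
	unsigned int vmemmap = pages_per_huge_page * struct_page_size;

	/* one vmemmap page is kept (HUGETLB_VMEMMAP_RESERVE_SIZE), the rest is freed */
	printf("vmemmap %u bytes, optimizable %u bytes (%u pages freed)\n",
	       vmemmap, vmemmap - page_size, (vmemmap - page_size) / page_size);
	return 0;
}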
// SPDX-License-Identifier: GPL-2.0
/*
 * gendisk handling
 *
 * Portions Copyright (C) 2020 Christoph Hellwig
 */

#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/spinlock.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> #include <linux/badblocks.h> #include <linux/part_stat.h> #include <linux/blktrace_api.h> #include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-cgroup.h" static struct kobject *block_depr; /* * Unique, monotonically increasing sequential number associated with block * devices instances (i.e. incremented each time a device is attached). * Associating uevents with block devices in userspace is difficult and racy: * the uevent netlink socket is lossy, and on slow and overloaded systems has * a very high latency. * Block devices do not have exclusive owners in userspace, any process can set * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 * can be reused again and again). * A userspace process setting up a block device and watching for its events * cannot thus reliably tell whether an event relates to the device it just set * up or another earlier instance with the same name. * This sequential number allows userspace processes to solve this problem, and * uniquely associate an uevent to the lifetime to a device. */ static atomic64_t diskseq; /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) static DEFINE_IDA(ext_devt_ida); void set_capacity(struct gendisk *disk, sector_t sectors) { if (sectors > BLK_DEV_MAX_SECTORS) { pr_warn_once("%s: truncate capacity from %lld to %lld\n", disk->disk_name, sectors, BLK_DEV_MAX_SECTORS); sectors = BLK_DEV_MAX_SECTORS; } bdev_set_nr_sectors(disk->part0, sectors); } EXPORT_SYMBOL(set_capacity); /* * Set disk capacity and notify if the size is not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. */ bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); /* * Only print a message and send a uevent if the gendisk is user visible * and alive. This avoids spamming the log and udev when setting the * initial capacity during probing. */ if (size == capacity || !disk_live(disk) || (disk->flags & GENHD_FL_HIDDEN)) return false; pr_info("%s: detected capacity change from %lld to %lld\n", disk->disk_name, capacity, size); /* * Historically we did not send a uevent for changes to/from an empty * device. 
*/ if (!capacity || !size) return false; kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); static void part_stat_read_all(struct block_device *part, struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { stat->nsecs[group] += ptr->nsecs[group]; stat->sectors[group] += ptr->sectors[group]; stat->ios[group] += ptr->ios[group]; stat->merges[group] += ptr->merges[group]; } stat->io_ticks += ptr->io_ticks; } } unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; for_each_possible_cpu(cpu) { inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + part_stat_local_read_cpu(part, in_flight[1], cpu); } if ((int)inflight < 0) inflight = 0; return inflight; } static void part_in_flight_rw(struct block_device *part, unsigned int inflight[2]) { int cpu; inflight[0] = 0; inflight[1] = 0; for_each_possible_cpu(cpu) { inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); } if ((int)inflight[0] < 0) inflight[0] = 0; if ((int)inflight[1] < 0) inflight[1] = 0; } /* * Can be deleted altogether. Later. * */ #define BLKDEV_MAJOR_HASH_SIZE 255 static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD void (*probe)(dev_t devt); #endif } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); static DEFINE_SPINLOCK(major_names_spinlock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) { return major % BLKDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; spin_lock(&major_names_spinlock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); spin_unlock(&major_names_spinlock); } #endif /* CONFIG_PROC_FS */ /** * __register_blkdev - register a new block device * * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string * @probe: pre-devtmpfs / pre-udev callback used to create disks when their * pre-created device node is accessed. When a probe call uses * add_disk() and it fails the driver must cleanup resources. This * interface may soon be removed. * * The @name must be unique within the system. * * The return value depends on the @major input parameter: * * - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1] * then the function returns zero on success, or a negative error code * - if any unused major number was requested with @major = 0 parameter * then the return value is the allocated major number in range * [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise * * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. * * Use register_blkdev instead for any new code. 
*/ int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) { if (major_names[index] == NULL) break; } if (index == 0) { printk("%s: failed to get major for %s\n", __func__, name); ret = -EBUSY; goto out; } major = index; ret = major; } if (major >= BLKDEV_MAJOR_MAX) { pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n", __func__, major, BLKDEV_MAJOR_MAX-1, name); ret = -EINVAL; goto out; } p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL); if (p == NULL) { ret = -ENOMEM; goto out; } p->major = major; #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD p->probe = probe; #endif strscpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) { if ((*n)->major == major) break; } if (!*n) *n = p; else ret = -EBUSY; spin_unlock(&major_names_spinlock); if (ret < 0) { printk("register_blkdev: cannot get major %u for %s\n", major, name); kfree(p); } out: mutex_unlock(&major_names_lock); return ret; } EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { struct blk_major_name **n; struct blk_major_name *p = NULL; int index = major_to_index(major); mutex_lock(&major_names_lock); spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; if (!*n || strcmp((*n)->name, name)) { WARN_ON(1); } else { p = *n; *n = p->next; } spin_unlock(&major_names_spinlock); mutex_unlock(&major_names_lock); kfree(p); } EXPORT_SYMBOL(unregister_blkdev); int blk_alloc_ext_minor(void) { int idx; idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL); if (idx == -ENOSPC) return -EBUSY; return idx; } void blk_free_ext_minor(unsigned int minor) { ida_free(&ext_devt_ida, minor); } void disk_uevent(struct gendisk *disk, enum kobject_action action) { struct block_device *part; unsigned long idx; rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, part) { if (bdev_is_partition(part) && !bdev_nr_sectors(part)) continue; if (!kobject_get_unless_zero(&part->bd_device.kobj)) continue; rcu_read_unlock(); kobject_uevent(bdev_kobj(part), action); put_device(&part->bd_device); rcu_read_lock(); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { struct file *file; int ret = 0; if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; /* * If the device is opened exclusively by current thread already, it's * safe to scan partitons, otherwise, use bd_prepare_to_claim() to * synchronize with other exclusive openers and other partition * scanners. */ if (!(mode & BLK_OPEN_EXCL)) { ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions, NULL); if (ret) return ret; } set_bit(GD_NEED_PART_SCAN, &disk->state); file = bdev_file_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, NULL); if (IS_ERR(file)) ret = PTR_ERR(file); else fput(file); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, * and this will cause that re-assemble partitioned raid device will * creat partition for underlying disk. 
*/ clear_bit(GD_NEED_PART_SCAN, &disk->state); if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(disk->part0, disk_scan_partitions); return ret; } /** * add_disk_fwnode - add disk information to kernel list with fwnode * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups * @fwnode: attached disk fwnode * * This function registers the partitioning information in @disk * with the kernel. Also attach a fwnode to the disk device. */ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk, const struct attribute_group **groups, struct fwnode_handle *fwnode) { struct device *ddev = disk_to_dev(disk); int ret; if (WARN_ON_ONCE(bdev_nr_sectors(disk->part0) > BLK_DEV_MAX_SECTORS)) return -EINVAL; if (queue_is_mq(disk->queue)) { /* * ->submit_bio and ->poll_bio are bypassed for blk-mq drivers. */ if (disk->fops->submit_bio || disk->fops->poll_bio) return -EINVAL; /* * Initialize the I/O scheduler code and pick a default one if * needed. */ elevator_init_mq(disk->queue); } else { if (!disk->fops->submit_bio) return -EINVAL; bdev_set_flag(disk->part0, BD_HAS_SUBMIT_BIO); } /* * If the driver provides an explicit major number it also must provide * the number of minors numbers supported, and those will be used to * setup the gendisk. * Otherwise just allocate the device numbers for both the whole device * and all partitions from the extended dev_t space. */ ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) goto out_exit_elevator; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", DISK_MAX_PARTS); disk->minors = DISK_MAX_PARTS; } if (disk->first_minor > MINORMASK || disk->minors > MINORMASK + 1 || disk->first_minor + disk->minors > MINORMASK + 1) goto out_exit_elevator; } else { if (WARN_ON(disk->minors)) goto out_exit_elevator; ret = blk_alloc_ext_minor(); if (ret < 0) goto out_exit_elevator; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } /* delay uevents, until we scanned partition table */ dev_set_uevent_suppress(ddev, 1); ddev->parent = parent; ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); if (fwnode) device_set_node(ddev, fwnode); if (!(disk->flags & GENHD_FL_HIDDEN)) ddev->devt = MKDEV(disk->major, disk->first_minor); ret = device_add(ddev); if (ret) goto out_free_ext_minor; ret = disk_alloc_events(disk); if (ret) goto out_device_del; ret = sysfs_create_link(block_depr, &ddev->kobj, kobject_name(&ddev->kobj)); if (ret) goto out_device_del; /* * avoid probable deadlock caused by allocating memory with * GFP_KERNEL in runtime_resume callback of its all ancestor * devices */ pm_runtime_set_memalloc_noio(ddev, true); disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); if (!disk->part0->bd_holder_dir) { ret = -ENOMEM; goto out_del_block_link; } disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (!disk->slave_dir) { ret = -ENOMEM; goto out_put_holder_dir; } ret = blk_register_queue(disk); if (ret) goto out_put_slave_dir; if (!(disk->flags & GENHD_FL_HIDDEN)) { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); if (ret) goto out_unregister_queue; bdi_set_owner(disk->bdi, ddev); ret = sysfs_create_link(&ddev->kobj, &disk->bdi->dev->kobj, "bdi"); if (ret) goto out_unregister_bdi; /* Make sure the first partition scan will be proceed */ if (get_capacity(disk) && disk_has_partscan(disk)) set_bit(GD_NEED_PART_SCAN, &disk->state); bdev_add(disk->part0, ddev->devt); if 
(get_capacity(disk)) disk_scan_partitions(disk, BLK_OPEN_READ); /* * Announce the disk and partitions after all partitions are * created. (for hidden disks uevents remain suppressed forever) */ dev_set_uevent_suppress(ddev, 0); disk_uevent(disk, KOBJ_ADD); } else { /* * Even if the block_device for a hidden gendisk is not * registered, it needs to have a valid bd_dev so that the * freeing of the dynamic major works. */ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); disk_add_events(disk); set_bit(GD_ADDED, &disk->state); return 0; out_unregister_bdi: if (!(disk->flags & GENHD_FL_HIDDEN)) bdi_unregister(disk->bdi); out_unregister_queue: blk_unregister_queue(disk); rq_qos_exit(disk->queue); out_put_slave_dir: kobject_put(disk->slave_dir); disk->slave_dir = NULL; out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); out_del_block_link: sysfs_remove_link(block_depr, dev_name(ddev)); pm_runtime_set_memalloc_noio(ddev, false); out_device_del: device_del(ddev); out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); out_exit_elevator: if (disk->queue->elevator) elevator_exit(disk->queue); return ret; } EXPORT_SYMBOL_GPL(add_disk_fwnode); /** * device_add_disk - add disk information to kernel list * @parent: parent device for the disk * @disk: per-device partitioning information * @groups: Additional per-device sysfs groups * * This function registers the partitioning information in @disk * with the kernel. */ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, const struct attribute_group **groups) { return add_disk_fwnode(parent, disk, groups, NULL); } EXPORT_SYMBOL(device_add_disk); static void blk_report_disk_dead(struct gendisk *disk, bool surprise) { struct block_device *bdev; unsigned long idx; /* * On surprise disk removal, bdev_mark_dead() may call into file * systems below. Make it clear that we're expecting to not hold * disk->open_mutex. */ lockdep_assert_not_held(&disk->open_mutex); rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, bdev) { if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) continue; rcu_read_unlock(); bdev_mark_dead(bdev, surprise); put_device(&bdev->bd_device); rcu_read_lock(); } rcu_read_unlock(); } static bool __blk_mark_disk_dead(struct gendisk *disk) { /* * Fail any new I/O. */ if (test_and_set_bit(GD_DEAD, &disk->state)) return false; if (test_bit(GD_OWNS_QUEUE, &disk->state)) blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); /* * Stop buffered writers from dirtying pages that can't be written out. */ set_capacity(disk, 0); /* * Prevent new I/O from crossing bio_queue_enter(). */ return blk_queue_start_drain(disk->queue); } /** * blk_mark_disk_dead - mark a disk as dead * @disk: disk to mark as dead * * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O * to this disk. */ void blk_mark_disk_dead(struct gendisk *disk) { __blk_mark_disk_dead(disk); blk_report_disk_dead(disk, true); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); /** * del_gendisk - remove the gendisk * @disk: the struct gendisk to remove * * Removes the gendisk and all its associated resources. This deletes the * partitions associated with the gendisk, and unregisters the associated * request_queue. * * This is the counter to the respective __device_add_disk() call. 
* * The final removal of the struct gendisk happens when its refcount reaches 0 * with put_disk(), which should be called after del_gendisk(), if * __device_add_disk() was used. * * Drivers exist which depend on the release of the gendisk to be synchronous, * it should not be deferred. * * Context: can sleep */ void del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; struct block_device *part; unsigned long idx; bool start_drain; might_sleep(); if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; disk_del_events(disk); /* * Prevent new openers by unlinked the bdev inode. */ mutex_lock(&disk->open_mutex); xa_for_each(&disk->part_tbl, idx, part) bdev_unhash(part); mutex_unlock(&disk->open_mutex); /* * Tell the file system to write back all dirty data and shut down if * it hasn't been notified earlier. */ if (!test_bit(GD_DEAD, &disk->state)) blk_report_disk_dead(disk, false); /* * Drop all partitions now that the disk is marked dead. */ mutex_lock(&disk->open_mutex); start_drain = __blk_mark_disk_dead(disk); if (start_drain) blk_freeze_acquire_lock(q); xa_for_each_start(&disk->part_tbl, idx, part, 1) drop_partition(part); mutex_unlock(&disk->open_mutex); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ bdi_unregister(disk->bdi); } blk_unregister_queue(disk); kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); disk->slave_dir = NULL; part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); blk_mq_freeze_queue_wait(q); blk_throtl_cancel_bios(disk); blk_sync_queue(q); blk_flush_integrity(); if (queue_is_mq(q)) blk_mq_cancel_work_sync(q); blk_mq_quiesce_queue(q); if (q->elevator) { mutex_lock(&q->sysfs_lock); elevator_exit(q); mutex_unlock(&q->sysfs_lock); } rq_qos_exit(q); blk_mq_unquiesce_queue(q); /* * If the disk does not own the queue, allow using passthrough requests * again. Else leave the queue frozen to fail all I/O. */ if (!test_bit(GD_OWNS_QUEUE, &disk->state)) __blk_mq_unfreeze_queue(q, true); else if (queue_is_mq(q)) blk_mq_exit_queue(q); if (start_drain) blk_unfreeze_release_lock(q); } EXPORT_SYMBOL(del_gendisk); /** * invalidate_disk - invalidate the disk * @disk: the struct gendisk to invalidate * * A helper to invalidates the disk. It will clean the disk's associated * buffer/page caches and reset its internal states so that the disk * can be reused by the drivers. * * Context: can sleep */ void invalidate_disk(struct gendisk *disk) { struct block_device *bdev = disk->part0; invalidate_bdev(bdev); bdev->bd_mapping->wb_err = 0; set_capacity(disk, 0); } EXPORT_SYMBOL(invalidate_disk); /* sysfs access to bad-blocks list. 
*/ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, char *page) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return sysfs_emit(page, "\n"); return badblocks_show(disk->bb, page, 0); } static ssize_t disk_badblocks_store(struct device *dev, struct device_attribute *attr, const char *page, size_t len) { struct gendisk *disk = dev_to_disk(dev); if (!disk->bb) return -ENXIO; return badblocks_store(disk->bb, page, len, 0); } #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD static bool blk_probe_dev(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; mutex_lock(&major_names_lock); for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { if ((*n)->major == major && (*n)->probe) { (*n)->probe(devt); mutex_unlock(&major_names_lock); return true; } } mutex_unlock(&major_names_lock); return false; } void blk_request_module(dev_t devt) { int error; if (blk_probe_dev(devt)) return; error = request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)); /* Make old-style 2.4 aliases work */ if (error > 0) error = request_module("block-major-%d", MAJOR(devt)); if (!error) blk_probe_dev(devt); } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ #ifdef CONFIG_PROC_FS /* iterator */ static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) { loff_t skip = *pos; struct class_dev_iter *iter; struct device *dev; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return ERR_PTR(-ENOMEM); seqf->private = iter; class_dev_iter_init(iter, &block_class, NULL, &disk_type); do { dev = class_dev_iter_next(iter); if (!dev) return NULL; } while (skip--); return dev_to_disk(dev); } static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) { struct device *dev; (*pos)++; dev = class_dev_iter_next(seqf->private); if (dev) return dev_to_disk(dev); return NULL; } static void disk_seqf_stop(struct seq_file *seqf, void *v) { struct class_dev_iter *iter = seqf->private; /* stop is called even after start failed :-( */ if (iter) { class_dev_iter_exit(iter); kfree(iter); seqf->private = NULL; } } static void *show_partition_start(struct seq_file *seqf, loff_t *pos) { void *p; p = disk_seqf_start(seqf, pos); if (!IS_ERR_OR_NULL(p) && !*pos) seq_puts(seqf, "major minor #blocks name\n\n"); return p; } static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; struct block_device *part; unsigned long idx; if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; rcu_read_lock(); xa_for_each(&sgp->part_tbl, idx, part) { if (!bdev_nr_sectors(part)) continue; seq_printf(seqf, "%4d %7d %10llu %pg\n", MAJOR(part->bd_dev), MINOR(part->bd_dev), bdev_nr_sectors(part) >> 1, part); } rcu_read_unlock(); return 0; } static const struct seq_operations partitions_op = { .start = show_partition_start, .next = disk_seqf_next, .stop = disk_seqf_stop, .show = show_partition }; #endif static int __init genhd_device_init(void) { int error; error = class_register(&block_class); if (unlikely(error)) return error; blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); /* create top-level block dir */ block_depr = kobject_create_and_add("block", NULL); return 0; } subsys_initcall(genhd_device_init); static ssize_t disk_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", disk->minors); } static ssize_t disk_ext_range_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return 
sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); } static ssize_t disk_hidden_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); } static ssize_t disk_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); } ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct disk_stats stat; unsigned int inflight; inflight = part_in_flight(bdev); if (inflight) { part_stat_lock(); update_io_ticks(bdev, jiffies, true); part_stat_unlock(); } part_stat_read_all(bdev, &stat); return sysfs_emit(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " "%8u %8u %8u " "%8lu %8lu %8llu %8u " "%8lu %8u" "\n", stat.ios[STAT_READ], stat.merges[STAT_READ], (unsigned long long)stat.sectors[STAT_READ], (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), stat.ios[STAT_WRITE], stat.merges[STAT_WRITE], (unsigned long long)stat.sectors[STAT_WRITE], (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), inflight, jiffies_to_msecs(stat.io_ticks), (unsigned int)div_u64(stat.nsecs[STAT_READ] + stat.nsecs[STAT_WRITE] + stat.nsecs[STAT_DISCARD] + stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC), stat.ios[STAT_DISCARD], stat.merges[STAT_DISCARD], (unsigned long long)stat.sectors[STAT_DISCARD], (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), stat.ios[STAT_FLUSH], (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); } ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); struct request_queue *q = bdev_get_queue(bdev); unsigned int inflight[2]; if (queue_is_mq(q)) blk_mq_in_flight_rw(q, bdev, inflight); else part_in_flight_rw(bdev, inflight); return sysfs_emit(buf, "%8u %8u\n", inflight[0], inflight[1]); } static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { dev_warn_once(dev, "the capability attribute has been deprecated.\n"); return sysfs_emit(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t diskseq_show(struct device *dev, struct device_attribute *attr, char *buf) { struct gendisk *disk = dev_to_disk(dev); return sysfs_emit(buf, "%llu\n", disk->diskseq); } static ssize_t partscan_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); } static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static 
DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL); static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL); static DEVICE_ATTR(size, 0444, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL); static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL); static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", bdev_test_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL)); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) { if (i) bdev_set_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); else bdev_clear_flag(dev_to_bdev(dev), BD_MAKE_IT_FAIL); } return count; } static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); #endif /* CONFIG_FAIL_MAKE_REQUEST */ #ifdef CONFIG_FAIL_IO_TIMEOUT static struct device_attribute dev_attr_fail_timeout = __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); #endif static struct attribute *disk_attrs[] = { &dev_attr_range.attr, &dev_attr_ext_range.attr, &dev_attr_removable.attr, &dev_attr_hidden.attr, &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_discard_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, &dev_attr_badblocks.attr, &dev_attr_events.attr, &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif #ifdef CONFIG_FAIL_IO_TIMEOUT &dev_attr_fail_timeout.attr, #endif NULL }; static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, typeof(*dev), kobj); struct gendisk *disk = dev_to_disk(dev); if (a == &dev_attr_badblocks.attr && !disk->bb) return 0; return a->mode; } static struct attribute_group disk_attr_group = { .attrs = disk_attrs, .is_visible = disk_visible, }; static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, #ifdef CONFIG_BLK_DEV_IO_TRACE &blk_trace_attr_group, #endif #ifdef CONFIG_BLK_DEV_INTEGRITY &blk_integrity_attr_group, #endif NULL }; /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk * * This function releases all allocated resources of the gendisk. * * Drivers which used __device_add_disk() have a gendisk with a request_queue * assigned. Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the * request_queue refcount to reach 0 at this point, and so the request_queue * will also be freed prior to the disk. 

/**
 * disk_release - releases all allocated resources of the gendisk
 * @dev: the device representing this disk
 *
 * This function releases all allocated resources of the gendisk.
 *
 * Drivers which used __device_add_disk() have a gendisk with a request_queue
 * assigned. Since the request_queue sits on top of the gendisk for these
 * drivers we also call blk_put_queue() for them, and we expect the
 * request_queue refcount to reach 0 at this point, and so the request_queue
 * will also be freed prior to the disk.
 *
 * Context: can sleep
 */
static void disk_release(struct device *dev)
{
	struct gendisk *disk = dev_to_disk(dev);

	might_sleep();
	WARN_ON_ONCE(disk_live(disk));

	blk_trace_remove(disk->queue);

	/*
	 * To undo all the initialization from blk_mq_init_allocated_queue in
	 * case of a probe failure where add_disk is never called we have to
	 * call blk_mq_exit_queue here. We can't do this for the more common
	 * teardown case (yet) as the tagset can be gone by the time the disk
	 * is released once it was added.
	 */
	if (queue_is_mq(disk->queue) &&
	    test_bit(GD_OWNS_QUEUE, &disk->state) &&
	    !test_bit(GD_ADDED, &disk->state))
		blk_mq_exit_queue(disk->queue);

	blkcg_exit_disk(disk);

	bioset_exit(&disk->bio_split);

	disk_release_events(disk);
	kfree(disk->random);
	disk_free_zone_resources(disk);
	xa_destroy(&disk->part_tbl);

	disk->queue->disk = NULL;
	blk_put_queue(disk->queue);

	if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk)
		disk->fops->free_disk(disk);
	bdev_drop(disk->part0);	/* frees the disk */
}

static int block_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
	const struct gendisk *disk = dev_to_disk(dev);

	return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}

const struct class block_class = {
	.name		= "block",
	.dev_uevent	= block_uevent,
};

static char *block_devnode(const struct device *dev, umode_t *mode,
			   kuid_t *uid, kgid_t *gid)
{
	struct gendisk *disk = dev_to_disk(dev);

	if (disk->fops->devnode)
		return disk->fops->devnode(disk, mode);
	return NULL;
}

const struct device_type disk_type = {
	.name		= "disk",
	.groups		= disk_attr_groups,
	.release	= disk_release,
	.devnode	= block_devnode,
};
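
/*
 * Editor's illustrative sketch, not part of genhd.c: the teardown ordering
 * described in the disk_release() kernel-doc above, seen from a hypothetical
 * blk-mq driver's probe path.  All "sketch_*" identifiers are made up for
 * the example; blk_mq_alloc_tag_set(), blk_mq_alloc_disk(), put_disk() and
 * blk_mq_free_tag_set() are the real block-layer APIs.  Guarded by #if 0 so
 * it is never built.
 */
#if 0
static int sketch_probe(struct blk_mq_tag_set *set,
			const struct blk_mq_ops *sketch_mq_ops)
{
	struct gendisk *disk;
	int ret;

	set->ops = sketch_mq_ops;	/* assumed to provide .queue_rq */
	set->nr_hw_queues = 1;
	set->queue_depth = 128;
	set->numa_node = NUMA_NO_NODE;

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ret;

	/* NULL limits: the queue starts out with default queue_limits. */
	disk = blk_mq_alloc_disk(set, NULL, NULL);
	if (IS_ERR(disk)) {
		ret = PTR_ERR(disk);
		goto out_free_tag_set;
	}

	/*
	 * If probing fails before add_disk(), the gendisk reference must be
	 * dropped while the tag_set still exists: disk_release() above then
	 * calls blk_mq_exit_queue() because GD_OWNS_QUEUE is set and
	 * GD_ADDED never was.
	 */
	if (sketch_hw_init_failed()) {	/* hypothetical device bring-up */
		ret = -ENODEV;
		put_disk(disk);		/* before blk_mq_free_tag_set() */
		goto out_free_tag_set;
	}

	return 0;

out_free_tag_set:
	blk_mq_free_tag_set(set);
	return ret;
}
#endif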

#ifdef CONFIG_PROC_FS
/*
 * aggregate disk stat collector.  Uses the same stats that the sysfs
 * entries do, above, but makes them available through one seq_file.
 *
 * The output looks suspiciously like /proc/partitions with a bunch of
 * extra fields.
 */
static int diskstats_show(struct seq_file *seqf, void *v)
{
	struct gendisk *gp = v;
	struct block_device *hd;
	unsigned int inflight;
	struct disk_stats stat;
	unsigned long idx;

	/*
	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
		seq_puts(seqf,	"major minor name"
				"     rio rmerge rsect ruse wio wmerge "
				"wsect wuse running use aveq"
				"\n\n");
	*/

	rcu_read_lock();
	xa_for_each(&gp->part_tbl, idx, hd) {
		if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
			continue;

		inflight = part_in_flight(hd);
		if (inflight) {
			part_stat_lock();
			update_io_ticks(hd, jiffies, true);
			part_stat_unlock();
		}
		part_stat_read_all(hd, &stat);
		seq_put_decimal_ull_width(seqf, "",  MAJOR(hd->bd_dev), 4);
		seq_put_decimal_ull_width(seqf, " ", MINOR(hd->bd_dev), 7);
		seq_printf(seqf, " %pg", hd);
		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_READ]);
		seq_put_decimal_ull(seqf, " ", stat.merges[STAT_READ]);
		seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_READ]);
		seq_put_decimal_ull(seqf, " ",
				(unsigned int)div_u64(stat.nsecs[STAT_READ],
						      NSEC_PER_MSEC));
		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_WRITE]);
		seq_put_decimal_ull(seqf, " ", stat.merges[STAT_WRITE]);
		seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_WRITE]);
		seq_put_decimal_ull(seqf, " ",
				(unsigned int)div_u64(stat.nsecs[STAT_WRITE],
						      NSEC_PER_MSEC));
		seq_put_decimal_ull(seqf, " ", inflight);
		seq_put_decimal_ull(seqf, " ", jiffies_to_msecs(stat.io_ticks));
		seq_put_decimal_ull(seqf, " ",
				(unsigned int)div_u64(stat.nsecs[STAT_READ] +
						      stat.nsecs[STAT_WRITE] +
						      stat.nsecs[STAT_DISCARD] +
						      stat.nsecs[STAT_FLUSH],
						      NSEC_PER_MSEC));
		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_DISCARD]);
		seq_put_decimal_ull(seqf, " ", stat.merges[STAT_DISCARD]);
		seq_put_decimal_ull(seqf, " ", stat.sectors[STAT_DISCARD]);
		seq_put_decimal_ull(seqf, " ",
				(unsigned int)div_u64(stat.nsecs[STAT_DISCARD],
						      NSEC_PER_MSEC));
		seq_put_decimal_ull(seqf, " ", stat.ios[STAT_FLUSH]);
		seq_put_decimal_ull(seqf, " ",
				(unsigned int)div_u64(stat.nsecs[STAT_FLUSH],
						      NSEC_PER_MSEC));
		seq_putc(seqf, '\n');
	}
	rcu_read_unlock();

	return 0;
}

static const struct seq_operations diskstats_op = {
	.start	= disk_seqf_start,
	.next	= disk_seqf_next,
	.stop	= disk_seqf_stop,
	.show	= diskstats_show
};

static int __init proc_genhd_init(void)
{
	proc_create_seq("diskstats", 0, NULL, &diskstats_op);
	proc_create_seq("partitions", 0, NULL, &partitions_op);
	return 0;
}
module_init(proc_genhd_init);
#endif /* CONFIG_PROC_FS */

dev_t part_devt(struct gendisk *disk, u8 partno)
{
	struct block_device *part;
	dev_t devt = 0;

	rcu_read_lock();
	part = xa_load(&disk->part_tbl, partno);
	if (part)
		devt = part->bd_dev;
	rcu_read_unlock();

	return devt;
}

struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
		struct lock_class_key *lkclass)
{
	struct gendisk *disk;

	disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
	if (!disk)
		return NULL;

	if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
		goto out_free_disk;

	disk->bdi = bdi_alloc(node_id);
	if (!disk->bdi)
		goto out_free_bioset;

	/* bdev_alloc() might need the queue, set before the first call */
	disk->queue = q;

	disk->part0 = bdev_alloc(disk, 0);
	if (!disk->part0)
		goto out_free_bdi;

	disk->node_id = node_id;
	mutex_init(&disk->open_mutex);
	xa_init(&disk->part_tbl);
	if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
		goto out_destroy_part_tbl;

	if (blkcg_init_disk(disk))
		goto out_erase_part0;

	disk_init_zone_resources(disk);
	rand_initialize_disk(disk);
	disk_to_dev(disk)->class = &block_class;
	disk_to_dev(disk)->type = &disk_type;
	device_initialize(disk_to_dev(disk));
	inc_diskseq(disk);
	q->disk = disk;
	lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
	INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
	return disk;

out_erase_part0:
	xa_erase(&disk->part_tbl, 0);
out_destroy_part_tbl:
	xa_destroy(&disk->part_tbl);
	disk->part0->bd_disk = NULL;
	bdev_drop(disk->part0);
out_free_bdi:
	bdi_put(disk->bdi);
out_free_bioset:
	bioset_exit(&disk->bio_split);
out_free_disk:
	kfree(disk);
	return NULL;
}

struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
		struct lock_class_key *lkclass)
{
	struct queue_limits default_lim = { };
	struct request_queue *q;
	struct gendisk *disk;

	q = blk_alloc_queue(lim ? lim : &default_lim, node);
	if (IS_ERR(q))
		return ERR_CAST(q);

	disk = __alloc_disk_node(q, node, lkclass);
	if (!disk) {
		blk_put_queue(q);
		return ERR_PTR(-ENOMEM);
	}
	set_bit(GD_OWNS_QUEUE, &disk->state);
	return disk;
}
EXPORT_SYMBOL(__blk_alloc_disk);

/**
 * put_disk - decrements the gendisk refcount
 * @disk: the struct gendisk to decrement the refcount for
 *
 * This decrements the refcount for the struct gendisk. When this reaches 0
 * we'll have disk_release() called.
 *
 * Note: for blk-mq disk put_disk must be called before freeing the tag_set
 * when handling probe errors (that is before add_disk() is called).
 *
 * Context: Any context, but the last reference must not be dropped from
 * atomic context.
 */
void put_disk(struct gendisk *disk)
{
	if (disk)
		put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);

static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
	char event[] = "DISK_RO=1";
	char *envp[] = { event, NULL };

	if (!ro)
		event[8] = '0';
	kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}

/**
 * set_disk_ro - set a gendisk read-only
 * @disk: gendisk to operate on
 * @read_only: %true to set the disk read-only, %false to set the disk read/write
 *
 * This function is used to indicate whether a given disk device should have its
 * read-only flag set. set_disk_ro() is typically used by device drivers to
 * indicate whether the underlying physical device is write-protected.
 */
void set_disk_ro(struct gendisk *disk, bool read_only)
{
	if (read_only) {
		if (test_and_set_bit(GD_READ_ONLY, &disk->state))
			return;
	} else {
		if (!test_and_clear_bit(GD_READ_ONLY, &disk->state))
			return;
	}
	set_disk_ro_uevent(disk, read_only);
}
EXPORT_SYMBOL(set_disk_ro);

void inc_diskseq(struct gendisk *disk)
{
	disk->diskseq = atomic64_inc_return(&diskseq);
}
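
/*
 * Editor's illustrative sketch, not part of genhd.c: how a simple bio-based
 * driver would consume the helpers defined above.  The "sketch_*" names,
 * capacity and block size are hypothetical; blk_alloc_disk(), set_disk_ro(),
 * add_disk(), del_gendisk() and put_disk() are the real APIs.  Guarded by
 * #if 0 so it is never built.
 */
#if 0
static int sketch_attach(const struct block_device_operations *sketch_fops,
			 bool write_protected)
{
	struct queue_limits lim = {
		.logical_block_size	= 512,
	};
	struct gendisk *disk;
	int ret;

	disk = blk_alloc_disk(&lim, NUMA_NO_NODE);	/* sets GD_OWNS_QUEUE */
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	disk->fops = sketch_fops;
	snprintf(disk->disk_name, sizeof(disk->disk_name), "sketch0");
	set_capacity(disk, 2048);	/* 1 MiB in 512-byte sectors */

	/* Reflects write protection via the "ro" attribute and a uevent. */
	set_disk_ro(disk, write_protected);

	ret = add_disk(disk);
	if (ret)
		put_disk(disk);	/* drops the last ref; disk_release() runs */
	return ret;
}

static void sketch_detach(struct gendisk *disk)
{
	del_gendisk(disk);
	put_disk(disk);
}
#endif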
// SPDX-License-Identifier: GPL-2.0
/*
 * SVC Greybus driver.
 *
 * Copyright 2015 Google Inc.
 * Copyright 2015 Linaro Ltd.
 */

#include <linux/debugfs.h>
#include <linux/kstrtox.h>
#include <linux/workqueue.h>
#include <linux/greybus.h>

#define SVC_INTF_EJECT_TIMEOUT		9000
#define SVC_INTF_ACTIVATE_TIMEOUT	6000
#define SVC_INTF_RESUME_TIMEOUT		3000

struct gb_svc_deferred_request {
	struct work_struct work;
	struct gb_operation *operation;
};

static int gb_svc_queue_deferred_request(struct gb_operation *operation);

static ssize_t endo_id_show(struct device *dev, stru