| 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 | /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/nfs_fs.h * * Copyright (C) 1992 Rick Sladkey * * OS-specific nfs filesystem definitions and declarations */ #ifndef _LINUX_NFS_FS_H #define _LINUX_NFS_FS_H #include <uapi/linux/nfs_fs.h> /* * Enable dprintk() debugging support for nfs client. */ #ifdef CONFIG_NFS_DEBUG # define NFS_DEBUG #endif #include <linux/in.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/rwsem.h> #include <linux/wait.h> #include <linux/sunrpc/debug.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/clnt.h> #ifdef CONFIG_NFS_FSCACHE #include <linux/netfs.h> #endif #include <linux/nfs.h> #include <linux/nfs2.h> #include <linux/nfs3.h> #include <linux/nfs4.h> #include <linux/nfs_xdr.h> #include <linux/nfs_fs_sb.h> #include <linux/mempool.h> /* * These are the default for number of transports to different server IPs */ #define NFS_MAX_TRANSPORTS 16 /* * Size of the NFS directory verifier */ #define NFS_DIR_VERIFIER_SIZE 2 /* * NFSv3/v4 Access mode cache entry */ struct nfs_access_entry { struct rb_node rb_node; struct list_head lru; kuid_t fsuid; kgid_t fsgid; struct group_info *group_info; u64 timestamp; __u32 mask; struct rcu_head rcu_head; }; struct nfs_lock_context { refcount_t count; struct list_head list; struct nfs_open_context *open_context; fl_owner_t lockowner; atomic_t io_count; struct rcu_head rcu_head; }; struct nfs_file_localio { struct nfsd_file __rcu *ro_file; struct nfsd_file __rcu *rw_file; struct list_head list; void __rcu *nfs_uuid; /* opaque pointer to 'nfs_uuid_t' */ }; static inline void nfs_localio_file_init(struct nfs_file_localio *nfl) { #if IS_ENABLED(CONFIG_NFS_LOCALIO) nfl->ro_file = NULL; nfl->rw_file = NULL; INIT_LIST_HEAD(&nfl->list); nfl->nfs_uuid = NULL; #endif } struct nfs4_state; struct nfs_open_context { struct nfs_lock_context lock_context; fl_owner_t flock_owner; struct dentry *dentry; const struct cred *cred; struct rpc_cred __rcu *ll_cred; /* low-level cred - use to check for expiry */ struct nfs4_state *state; fmode_t mode; int error; unsigned long flags; #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) struct nfs4_threshold *mdsthreshold; struct list_head list; struct rcu_head rcu_head; struct nfs_file_localio nfl; }; struct nfs_open_dir_context { struct list_head list; atomic_t cache_hits; atomic_t cache_misses; unsigned long attr_gencount; __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 last_cookie; pgoff_t page_index; unsigned int dtsize; bool force_clear; bool eof; struct rcu_head rcu_head; }; /* * NFSv4 delegation */ struct nfs_delegation; struct posix_acl; struct nfs4_xattr_cache; /* * nfs fs inode data in memory */ struct nfs_inode { /* * The 64bit 'inode number' */ __u64 fileid; /* * NFS file handle */ struct nfs_fh fh; /* * Various flags */ unsigned long flags; /* atomic bit ops */ unsigned long cache_validity; /* bit mask */ /* * NFS Attributes not included in struct inode */ struct timespec64 btime; /* * read_cache_jiffies is when we started read-caching this inode. * attrtimeo is for how long the cached information is assumed * to be valid. A successful attribute revalidation doubles * attrtimeo (up to acregmax/acdirmax), a failure resets it to * acregmin/acdirmin. * * We need to revalidate the cached attrs for this inode if * * jiffies - read_cache_jiffies >= attrtimeo * * Please note the comparison is greater than or equal * so that zero timeout values can be specified. */ unsigned long read_cache_jiffies; unsigned long attrtimeo; unsigned long attrtimeo_timestamp; unsigned long attr_gencount; struct rb_root access_cache; struct list_head access_cache_entry_lru; struct list_head access_cache_inode_lru; union { /* Directory */ struct { /* "Generation counter" for the attribute cache. * This is bumped whenever we update the metadata * on the server. */ unsigned long cache_change_attribute; /* * This is the cookie verifier used for NFSv3 readdir * operations */ __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; /* Readers: in-flight sillydelete RPC calls */ /* Writers: rmdir */ struct rw_semaphore rmdir_sem; }; /* Regular file */ struct { atomic_long_t nrequests; atomic_long_t redirtied_pages; struct nfs_mds_commit_info commit_info; struct mutex commit_mutex; }; }; /* Open contexts for shared mmap writes */ struct list_head open_files; /* Keep track of out-of-order replies. * The ooo array contains start/end pairs of * numbers from the changeid sequence when * the inode's iversion has been updated. * It also contains end/start pair (i.e. reverse order) * of sections of the changeid sequence that have * been seen in replies from the server. * Normally these should match and when both * A:B and B:A are found in ooo, they are both removed. * And if a reply with A:B causes an iversion update * of A:B, then neither are added. * When a reply has pre_change that doesn't match * iversion, then the changeid pair and any consequent * change in iversion ARE added. Later replies * might fill in the gaps, or possibly a gap is caused * by a change from another client. * When a file or directory is opened, if the ooo table * is not empty, then we assume the gaps were due to * another client and we invalidate the cached data. * * We can only track a limited number of concurrent gaps. * Currently that limit is 16. * We allocate the table on demand. If there is insufficient * memory, then we probably cannot cache the file anyway * so there is no loss. */ struct { int cnt; struct { u64 start, end; } gap[16]; } *ooo; #if IS_ENABLED(CONFIG_NFS_V4) struct nfs4_cached_acl *nfs4_acl; /* NFSv4 state */ struct list_head open_states; struct nfs_delegation __rcu *delegation; struct rw_semaphore rwsem; /* pNFS layout information */ struct pnfs_layout_hdr *layout; #endif /* CONFIG_NFS_V4*/ /* how many bytes have been written/read and how many bytes queued up */ __u64 write_io; __u64 read_io; #ifdef CONFIG_NFS_V4_2 struct nfs4_xattr_cache *xattr_cache; #endif union { struct inode vfs_inode; #ifdef CONFIG_NFS_FSCACHE struct netfs_inode netfs; /* netfs context and VFS inode */ #endif }; }; struct nfs4_copy_state { struct list_head copies; struct list_head src_copies; nfs4_stateid stateid; struct completion completion; uint64_t count; struct nfs_writeverf verf; int error; int flags; struct nfs4_state *parent_src_state; struct nfs4_state *parent_dst_state; }; /* * Access bit flags */ #define NFS_ACCESS_READ 0x0001 #define NFS_ACCESS_LOOKUP 0x0002 #define NFS_ACCESS_MODIFY 0x0004 #define NFS_ACCESS_EXTEND 0x0008 #define NFS_ACCESS_DELETE 0x0010 #define NFS_ACCESS_EXECUTE 0x0020 #define NFS_ACCESS_XAREAD 0x0040 #define NFS_ACCESS_XAWRITE 0x0080 #define NFS_ACCESS_XALIST 0x0100 /* * Cache validity bit flags */ #define NFS_INO_INVALID_DATA BIT(1) /* cached data is invalid */ #define NFS_INO_INVALID_ATIME BIT(2) /* cached atime is invalid */ #define NFS_INO_INVALID_ACCESS BIT(3) /* cached access cred invalid */ #define NFS_INO_INVALID_ACL BIT(4) /* cached acls are invalid */ #define NFS_INO_REVAL_FORCED BIT(6) /* force revalidation ignoring a delegation */ #define NFS_INO_INVALID_LABEL BIT(7) /* cached label is invalid */ #define NFS_INO_INVALID_CHANGE BIT(8) /* cached change is invalid */ #define NFS_INO_INVALID_CTIME BIT(9) /* cached ctime is invalid */ #define NFS_INO_INVALID_MTIME BIT(10) /* cached mtime is invalid */ #define NFS_INO_INVALID_SIZE BIT(11) /* cached size is invalid */ #define NFS_INO_INVALID_OTHER BIT(12) /* other attrs are invalid */ #define NFS_INO_DATA_INVAL_DEFER \ BIT(13) /* Deferred cache invalidation */ #define NFS_INO_INVALID_BLOCKS BIT(14) /* cached blocks are invalid */ #define NFS_INO_INVALID_XATTR BIT(15) /* xattrs are invalid */ #define NFS_INO_INVALID_NLINK BIT(16) /* cached nlinks is invalid */ #define NFS_INO_INVALID_MODE BIT(17) /* cached mode is invalid */ #define NFS_INO_INVALID_BTIME BIT(18) /* cached btime is invalid */ #define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ | NFS_INO_INVALID_CTIME \ | NFS_INO_INVALID_MTIME \ | NFS_INO_INVALID_BTIME \ | NFS_INO_INVALID_SIZE \ | NFS_INO_INVALID_NLINK \ | NFS_INO_INVALID_MODE \ | NFS_INO_INVALID_OTHER) /* inode metadata is invalid */ /* * Bit offsets in flags field */ #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ #define NFS_INO_PRESERVE_UNLINKED (4) /* preserve file if removed while open */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ #define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { return container_of(inode, struct nfs_inode, vfs_inode); } static inline struct nfs_server *NFS_SB(const struct super_block *s) { return (struct nfs_server *)(s->s_fs_info); } static inline struct nfs_fh *NFS_FH(const struct inode *inode) { return &NFS_I(inode)->fh; } static inline struct nfs_server *NFS_SERVER(const struct inode *inode) { return NFS_SB(inode->i_sb); } static inline struct rpc_clnt *NFS_CLIENT(const struct inode *inode) { return NFS_SERVER(inode)->client; } static inline const struct nfs_rpc_ops *NFS_PROTO(const struct inode *inode) { return NFS_SERVER(inode)->nfs_client->rpc_ops; } static inline unsigned NFS_MINATTRTIMEO(const struct inode *inode) { struct nfs_server *nfss = NFS_SERVER(inode); return S_ISDIR(inode->i_mode) ? nfss->acdirmin : nfss->acregmin; } static inline unsigned NFS_MAXATTRTIMEO(const struct inode *inode) { struct nfs_server *nfss = NFS_SERVER(inode); return S_ISDIR(inode->i_mode) ? nfss->acdirmax : nfss->acregmax; } static inline int NFS_STALE(const struct inode *inode) { return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } static inline __u64 NFS_FILEID(const struct inode *inode) { return NFS_I(inode)->fileid; } static inline void set_nfs_fileid(struct inode *inode, __u64 fileid) { NFS_I(inode)->fileid = fileid; } static inline void nfs_mark_for_revalidate(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_SIZE; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); } static inline int nfs_server_capable(const struct inode *inode, int cap) { return NFS_SERVER(inode)->caps & cap; } /** * nfs_save_change_attribute - Returns the inode attribute change cookie * @dir - pointer to parent directory inode * The "cache change attribute" is updated when we need to revalidate * our dentry cache after a directory was seen to change on the server. */ static inline unsigned long nfs_save_change_attribute(struct inode *dir) { return NFS_I(dir)->cache_change_attribute; } /* * linux/fs/nfs/inode.c */ extern int nfs_sync_mapping(struct address_space *mapping); extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping); extern void nfs_zap_caches(struct inode *); extern void nfs_set_inode_stale(struct inode *inode); extern void nfs_invalidate_atime(struct inode *); extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, struct nfs_fattr *); struct inode *nfs_ilookup(struct super_block *sb, struct nfs_fattr *, struct nfs_fh *); extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *, const struct cred *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct mnt_idmap *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_attribute_cache_expired(struct inode *inode); extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_clear_invalid_mapping(struct address_space *mapping); extern bool nfs_mapping_need_revalidate_inode(struct inode *inode); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); extern int nfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode); extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode, struct file *filp); extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx); extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx); extern void nfs_file_clear_open_context(struct file *flip); extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); extern void nfs_fattr_set_barrier(struct nfs_fattr *fattr); extern unsigned long nfs_inc_attr_generation_counter(void); extern struct nfs_fattr *nfs_alloc_fattr(void); extern struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server); static inline void nfs4_label_free(struct nfs4_label *label) { #ifdef CONFIG_NFS_V4_SECURITY_LABEL if (label) { kfree(label->label); kfree(label); } #endif } static inline void nfs_free_fattr(const struct nfs_fattr *fattr) { if (fattr) nfs4_label_free(fattr->label); kfree(fattr); } extern struct nfs_fh *nfs_alloc_fhandle(void); static inline void nfs_free_fhandle(const struct nfs_fh *fh) { kfree(fh); } #ifdef NFS_DEBUG extern u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh); static inline u32 nfs_display_fhandle_hash(const struct nfs_fh *fh) { return _nfs_display_fhandle_hash(fh); } extern void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption); #define nfs_display_fhandle(fh, caption) \ do { \ if (unlikely(nfs_debug & NFSDBG_FACILITY)) \ _nfs_display_fhandle(fh, caption); \ } while (0) #else static inline u32 nfs_display_fhandle_hash(const struct nfs_fh *fh) { return 0; } static inline void nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) { } #endif /* * linux/fs/nfs/nfsroot.c */ extern int nfs_root_data(char **root_device, char **root_data); /*__init*/ /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ extern __be32 root_nfs_parse_addr(char *name); /*__init*/ /* * linux/fs/nfs/file.c */ extern const struct file_operations nfs_file_operations; #if IS_ENABLED(CONFIG_NFS_V4) extern const struct file_operations nfs4_file_operations; #endif /* CONFIG_NFS_V4 */ extern const struct address_space_operations nfs_file_aops; extern const struct address_space_operations nfs_dir_aops; static inline struct nfs_open_context *nfs_file_open_context(struct file *filp) { return filp->private_data; } static inline const struct cred *nfs_file_cred(struct file *file) { if (file != NULL) { struct nfs_open_context *ctx = nfs_file_open_context(file); if (ctx) return ctx->cred; } return NULL; } /* * linux/fs/nfs/direct.c */ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter); ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, bool swap); ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, bool swap); /* * linux/fs/nfs/dir.c */ extern const struct file_operations nfs_dir_operations; extern const struct dentry_operations nfs_dentry_operations; extern void nfs_force_lookup_revalidate(struct inode *dir); extern void nfs_set_verifier(struct dentry * dentry, unsigned long verf); #if IS_ENABLED(CONFIG_NFS_V4) extern void nfs_clear_verifier_delegated(struct inode *inode); #endif /* IS_ENABLED(CONFIG_NFS_V4) */ extern struct dentry *nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); extern void nfs_access_zap_cache(struct inode *inode); extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block); extern int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int open_flags, umode_t mode); /* * linux/fs/nfs/symlink.c */ extern const struct inode_operations nfs_symlink_inode_operations; /* * linux/fs/nfs/sysctl.c */ #ifdef CONFIG_SYSCTL extern int nfs_register_sysctl(void); extern void nfs_unregister_sysctl(void); #else #define nfs_register_sysctl() 0 #define nfs_unregister_sysctl() do { } while(0) #endif /* * linux/fs/nfs/namespace.c */ extern const struct inode_operations nfs_mountpoint_inode_operations; extern const struct inode_operations nfs_referral_inode_operations; extern int nfs_mountpoint_expiry_timeout; extern void nfs_release_automount_timer(void); /* * linux/fs/nfs/unlink.c */ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *); /* * linux/fs/nfs/write.c */ extern int nfs_congestion_kb; extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct folio *folio); extern int nfs_update_folio(struct file *file, struct folio *folio, unsigned int offset, unsigned int count); /* * Try to write back everything synchronously (but check the * return value!) */ extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_folio(struct inode *inode, struct folio *folio); int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_commit_data *data); void nfs_commit_begin(struct nfs_mds_commit_info *cinfo); bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); static inline bool nfs_have_writebacks(const struct inode *inode) { if (S_ISREG(inode->i_mode)) return atomic_long_read(&NFS_I(inode)->nrequests) != 0; return false; } /* * linux/fs/nfs/read.c */ int nfs_read_folio(struct file *, struct folio *); void nfs_readahead(struct readahead_control *); /* * inline functions */ static inline loff_t nfs_size_to_loff_t(__u64 size) { return min_t(u64, size, OFFSET_MAX); } static inline ino_t nfs_fileid_to_ino_t(u64 fileid) { ino_t ino = (ino_t) fileid; if (sizeof(ino_t) < sizeof(u64)) ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8; return ino; } static inline void nfs_ooo_clear(struct nfs_inode *nfsi) { nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER; kfree(nfsi->ooo); nfsi->ooo = NULL; } static inline bool nfs_ooo_test(struct nfs_inode *nfsi) { return (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) || (nfsi->ooo && nfsi->ooo->cnt > 0); } #define NFS_JUKEBOX_RETRY_TIME (5 * HZ) /* We need to block new opens while a file is being unlinked. * If it is opened *before* we decide to unlink, we will silly-rename * instead. If it is opened *after*, then we need to create or will fail. * If we allow the two to race, we could end up with a file that is open * but deleted on the server resulting in ESTALE. * So use ->d_fsdata to record when the unlink is happening * and block dentry revalidation while it is set. */ #define NFS_FSDATA_BLOCKED ((void*)1) # undef ifdebug # ifdef NFS_DEBUG # define ifdebug(fac) if (unlikely(nfs_debug & NFSDBG_##fac)) # define NFS_IFDEBUG(x) x # else # define ifdebug(fac) if (0) # define NFS_IFDEBUG(x) # endif #endif |
| 9 1 1 7 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | // SPDX-License-Identifier: GPL-2.0-only /* IP tables module for matching IPsec policy * * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/init.h> #include <net/xfrm.h> #include <linux/netfilter.h> #include <linux/netfilter/xt_policy.h> #include <linux/netfilter/x_tables.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Xtables: IPsec policy match"); MODULE_LICENSE("GPL"); static inline bool xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m, const union nf_inet_addr *a2, unsigned short family) { switch (family) { case NFPROTO_IPV4: return ((a1->ip ^ a2->ip) & m->ip) == 0; case NFPROTO_IPV6: return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0; } return false; } static bool match_xfrm_state(const struct xfrm_state *x, const struct xt_policy_elem *e, unsigned short family) { #define MATCH_ADDR(x,y,z) (!e->match.x || \ (xt_addr_cmp(&e->x, &e->y, (const union nf_inet_addr *)(z), family) \ ^ e->invert.x)) #define MATCH(x,y) (!e->match.x || ((e->x == (y)) ^ e->invert.x)) return MATCH_ADDR(saddr, smask, &x->props.saddr) && MATCH_ADDR(daddr, dmask, &x->id.daddr) && MATCH(proto, x->id.proto) && MATCH(mode, x->props.mode) && MATCH(spi, x->id.spi) && MATCH(reqid, x->props.reqid); } static int match_policy_in(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; const struct sec_path *sp = skb_sec_path(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; if (sp == NULL) return -1; if (strict && info->len != sp->len) return 0; for (i = sp->len - 1; i >= 0; i--) { pos = strict ? i - sp->len + 1 : 0; if (pos >= info->len) return 0; e = &info->pol[pos]; if (match_xfrm_state(sp->xvec[i], e, family)) { if (!strict) return 1; } else if (strict) return 0; } return strict ? 1 : 0; } static int match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, unsigned short family) { const struct xt_policy_elem *e; const struct dst_entry *dst = skb_dst(skb); int strict = info->flags & XT_POLICY_MATCH_STRICT; int i, pos; if (dst->xfrm == NULL) return -1; for (i = 0; dst && dst->xfrm; dst = ((struct xfrm_dst *)dst)->child, i++) { pos = strict ? i : 0; if (pos >= info->len) return 0; e = &info->pol[pos]; if (match_xfrm_state(dst->xfrm, e, family)) { if (!strict) return 1; } else if (strict) return 0; } return strict ? i == info->len : 0; } static bool policy_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_policy_info *info = par->matchinfo; int ret; if (info->flags & XT_POLICY_MATCH_IN) ret = match_policy_in(skb, info, xt_family(par)); else ret = match_policy_out(skb, info, xt_family(par)); if (ret < 0) ret = info->flags & XT_POLICY_MATCH_NONE ? true : false; else if (info->flags & XT_POLICY_MATCH_NONE) ret = false; return ret; } static int policy_mt_check(const struct xt_mtchk_param *par) { const struct xt_policy_info *info = par->matchinfo; const char *errmsg = "neither incoming nor outgoing policy selected"; if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) goto err; if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { errmsg = "output policy not valid in PREROUTING and INPUT"; goto err; } if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) { errmsg = "input policy not valid in POSTROUTING and OUTPUT"; goto err; } if (info->len > XT_POLICY_MAX_ELEM) { errmsg = "too many policy elements"; goto err; } return 0; err: pr_info_ratelimited("%s\n", errmsg); return -EINVAL; } static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", .family = NFPROTO_IPV4, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, { .name = "policy", .family = NFPROTO_IPV6, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), .me = THIS_MODULE, }, }; static int __init policy_mt_init(void) { return xt_register_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } static void __exit policy_mt_exit(void) { xt_unregister_matches(policy_mt_reg, ARRAY_SIZE(policy_mt_reg)); } module_init(policy_mt_init); module_exit(policy_mt_exit); MODULE_ALIAS("ipt_policy"); MODULE_ALIAS("ip6t_policy"); |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 | // SPDX-License-Identifier: GPL-2.0-or-later /* * T613 subdriver * * Copyright (C) 2010 Jean-Francois Moine (http://moinejf.free.fr) * *Notes: * t613 + tas5130A * * Focus to light do not balance well as in win. * Quality in win is not good, but its kinda better. * * Fix some "extraneous bytes", most of apps will show the image anyway * * Gamma table, is there, but its really doing something? * * 7~8 Fps, its ok, max on win its 10. * Costantino Leandro */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define MODULE_NAME "t613" #include <linux/input.h> #include <linux/slab.h> #include "gspca.h" MODULE_AUTHOR("Leandro Costantino <le_costantino@pixartargentina.com.ar>"); MODULE_DESCRIPTION("GSPCA/T613 (JPEG Compliance) USB Camera Driver"); MODULE_LICENSE("GPL"); struct sd { struct gspca_dev gspca_dev; /* !! must be the first item */ struct v4l2_ctrl *freq; struct { /* awb / color gains control cluster */ struct v4l2_ctrl *awb; struct v4l2_ctrl *gain; struct v4l2_ctrl *red_balance; struct v4l2_ctrl *blue_balance; }; u8 sensor; u8 button_pressed; }; enum sensors { SENSOR_OM6802, SENSOR_OTHER, SENSOR_TAS5130A, SENSOR_LT168G, /* must verify if this is the actual model */ }; static const struct v4l2_pix_format vga_mode_t16[] = { {160, 120, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 160, .sizeimage = 160 * 120 * 4 / 8 + 590, .colorspace = V4L2_COLORSPACE_JPEG, .priv = 4}, #if 0 /* HDG: broken with my test cam, so lets disable it */ {176, 144, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 176, .sizeimage = 176 * 144 * 3 / 8 + 590, .colorspace = V4L2_COLORSPACE_JPEG, .priv = 3}, #endif {320, 240, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 320, .sizeimage = 320 * 240 * 3 / 8 + 590, .colorspace = V4L2_COLORSPACE_JPEG, .priv = 2}, #if 0 /* HDG: broken with my test cam, so lets disable it */ {352, 288, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 352, .sizeimage = 352 * 288 * 3 / 8 + 590, .colorspace = V4L2_COLORSPACE_JPEG, .priv = 1}, #endif {640, 480, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, .bytesperline = 640, .sizeimage = 640 * 480 * 3 / 8 + 590, .colorspace = V4L2_COLORSPACE_JPEG, .priv = 0}, }; /* sensor specific data */ struct additional_sensor_data { const u8 n3[6]; const u8 *n4, n4sz; const u8 reg80, reg8e; const u8 nset8[6]; const u8 data1[10]; const u8 data2[9]; const u8 data3[9]; const u8 data5[6]; const u8 stream[4]; }; static const u8 n4_om6802[] = { 0x09, 0x01, 0x12, 0x04, 0x66, 0x8a, 0x80, 0x3c, 0x81, 0x22, 0x84, 0x50, 0x8a, 0x78, 0x8b, 0x68, 0x8c, 0x88, 0x8e, 0x33, 0x8f, 0x24, 0xaa, 0xb1, 0xa2, 0x60, 0xa5, 0x30, 0xa6, 0x3a, 0xa8, 0xe8, 0xae, 0x05, 0xb1, 0x00, 0xbb, 0x04, 0xbc, 0x48, 0xbe, 0x36, 0xc6, 0x88, 0xe9, 0x00, 0xc5, 0xc0, 0x65, 0x0a, 0xbb, 0x86, 0xaf, 0x58, 0xb0, 0x68, 0x87, 0x40, 0x89, 0x2b, 0x8d, 0xff, 0x83, 0x40, 0xac, 0x84, 0xad, 0x86, 0xaf, 0x46 }; static const u8 n4_other[] = { 0x66, 0x00, 0x7f, 0x00, 0x80, 0xac, 0x81, 0x69, 0x84, 0x40, 0x85, 0x70, 0x86, 0x20, 0x8a, 0x68, 0x8b, 0x58, 0x8c, 0x88, 0x8d, 0xff, 0x8e, 0xb8, 0x8f, 0x28, 0xa2, 0x60, 0xa5, 0x40, 0xa8, 0xa8, 0xac, 0x84, 0xad, 0x84, 0xae, 0x24, 0xaf, 0x56, 0xb0, 0x68, 0xb1, 0x00, 0xb2, 0x88, 0xbb, 0xc5, 0xbc, 0x4a, 0xbe, 0x36, 0xc2, 0x88, 0xc5, 0xc0, 0xc6, 0xda, 0xe9, 0x26, 0xeb, 0x00 }; static const u8 n4_tas5130a[] = { 0x80, 0x3c, 0x81, 0x68, 0x83, 0xa0, 0x84, 0x20, 0x8a, 0x68, 0x8b, 0x58, 0x8c, 0x88, 0x8e, 0xb4, 0x8f, 0x24, 0xa1, 0xb1, 0xa2, 0x30, 0xa5, 0x10, 0xa6, 0x4a, 0xae, 0x03, 0xb1, 0x44, 0xb2, 0x08, 0xb7, 0x06, 0xb9, 0xe7, 0xbb, 0xc4, 0xbc, 0x4a, 0xbe, 0x36, 0xbf, 0xff, 0xc2, 0x88, 0xc5, 0xc8, 0xc6, 0xda }; static const u8 n4_lt168g[] = { 0x66, 0x01, 0x7f, 0x00, 0x80, 0x7c, 0x81, 0x28, 0x83, 0x44, 0x84, 0x20, 0x86, 0x20, 0x8a, 0x70, 0x8b, 0x58, 0x8c, 0x88, 0x8d, 0xa0, 0x8e, 0xb3, 0x8f, 0x24, 0xa1, 0xb0, 0xa2, 0x38, 0xa5, 0x20, 0xa6, 0x4a, 0xa8, 0xe8, 0xaf, 0x38, 0xb0, 0x68, 0xb1, 0x44, 0xb2, 0x88, 0xbb, 0x86, 0xbd, 0x40, 0xbe, 0x26, 0xc1, 0x05, 0xc2, 0x88, 0xc5, 0xc0, 0xda, 0x8e, 0xdb, 0xca, 0xdc, 0xa8, 0xdd, 0x8c, 0xde, 0x44, 0xdf, 0x0c, 0xe9, 0x80 }; static const struct additional_sensor_data sensor_data[] = { [SENSOR_OM6802] = { .n3 = {0x61, 0x68, 0x65, 0x0a, 0x60, 0x04}, .n4 = n4_om6802, .n4sz = sizeof n4_om6802, .reg80 = 0x3c, .reg8e = 0x33, .nset8 = {0xa8, 0xf0, 0xc6, 0x88, 0xc0, 0x00}, .data1 = {0xc2, 0x28, 0x0f, 0x22, 0xcd, 0x27, 0x2c, 0x06, 0xb3, 0xfc}, .data2 = {0x80, 0xff, 0xff, 0x80, 0xff, 0xff, 0x80, 0xff, 0xff}, .data3 = {0x80, 0xff, 0xff, 0x80, 0xff, 0xff, 0x80, 0xff, 0xff}, .data5 = /* this could be removed later */ {0x0c, 0x03, 0xab, 0x13, 0x81, 0x23}, .stream = {0x0b, 0x04, 0x0a, 0x78}, }, [SENSOR_OTHER] = { .n3 = {0x61, 0xc2, 0x65, 0x88, 0x60, 0x00}, .n4 = n4_other, .n4sz = sizeof n4_other, .reg80 = 0xac, .reg8e = 0xb8, .nset8 = {0xa8, 0xa8, 0xc6, 0xda, 0xc0, 0x00}, .data1 = {0xc1, 0x48, 0x04, 0x1b, 0xca, 0x2e, 0x33, 0x3a, 0xe8, 0xfc}, .data2 = {0x4e, 0x9c, 0xec, 0x40, 0x80, 0xc0, 0x48, 0x96, 0xd9}, .data3 = {0x4e, 0x9c, 0xec, 0x40, 0x80, 0xc0, 0x48, 0x96, 0xd9}, .data5 = {0x0c, 0x03, 0xab, 0x29, 0x81, 0x69}, .stream = {0x0b, 0x04, 0x0a, 0x00}, }, [SENSOR_TAS5130A] = { .n3 = {0x61, 0xc2, 0x65, 0x0d, 0x60, 0x08}, .n4 = n4_tas5130a, .n4sz = sizeof n4_tas5130a, .reg80 = 0x3c, .reg8e = 0xb4, .nset8 = {0xa8, 0xf0, 0xc6, 0xda, 0xc0, 0x00}, .data1 = {0xbb, 0x28, 0x10, 0x10, 0xbb, 0x28, 0x1e, 0x27, 0xc8, 0xfc}, .data2 = {0x60, 0xa8, 0xe0, 0x60, 0xa8, 0xe0, 0x60, 0xa8, 0xe0}, .data3 = {0x60, 0xa8, 0xe0, 0x60, 0xa8, 0xe0, 0x60, 0xa8, 0xe0}, .data5 = {0x0c, 0x03, 0xab, 0x10, 0x81, 0x20}, .stream = {0x0b, 0x04, 0x0a, 0x40}, }, [SENSOR_LT168G] = { .n3 = {0x61, 0xc2, 0x65, 0x68, 0x60, 0x00}, .n4 = n4_lt168g, .n4sz = sizeof n4_lt168g, .reg80 = 0x7c, .reg8e = 0xb3, .nset8 = {0xa8, 0xf0, 0xc6, 0xba, 0xc0, 0x00}, .data1 = {0xc0, 0x38, 0x08, 0x10, 0xc0, 0x30, 0x10, 0x40, 0xb0, 0xf4}, .data2 = {0x40, 0x80, 0xc0, 0x50, 0xa0, 0xf0, 0x53, 0xa6, 0xff}, .data3 = {0x40, 0x80, 0xc0, 0x50, 0xa0, 0xf0, 0x53, 0xa6, 0xff}, .data5 = {0x0c, 0x03, 0xab, 0x4b, 0x81, 0x2b}, .stream = {0x0b, 0x04, 0x0a, 0x28}, }, }; #define MAX_EFFECTS 7 static const u8 effects_table[MAX_EFFECTS][6] = { {0xa8, 0xe8, 0xc6, 0xd2, 0xc0, 0x00}, /* Normal */ {0xa8, 0xc8, 0xc6, 0x52, 0xc0, 0x04}, /* Repujar */ {0xa8, 0xe8, 0xc6, 0xd2, 0xc0, 0x20}, /* Monochrome */ {0xa8, 0xe8, 0xc6, 0xd2, 0xc0, 0x80}, /* Sepia */ {0xa8, 0xc8, 0xc6, 0x52, 0xc0, 0x02}, /* Croquis */ {0xa8, 0xc8, 0xc6, 0xd2, 0xc0, 0x10}, /* Sun Effect */ {0xa8, 0xc8, 0xc6, 0xd2, 0xc0, 0x40}, /* Negative */ }; #define GAMMA_MAX (15) static const u8 gamma_table[GAMMA_MAX+1][17] = { /* gamma table from cam1690.ini */ {0x00, 0x00, 0x01, 0x04, 0x08, 0x0e, 0x16, 0x21, /* 0 */ 0x2e, 0x3d, 0x50, 0x65, 0x7d, 0x99, 0xb8, 0xdb, 0xff}, {0x00, 0x01, 0x03, 0x08, 0x0e, 0x16, 0x21, 0x2d, /* 1 */ 0x3c, 0x4d, 0x60, 0x75, 0x8d, 0xa6, 0xc2, 0xe1, 0xff}, {0x00, 0x01, 0x05, 0x0b, 0x12, 0x1c, 0x28, 0x35, /* 2 */ 0x45, 0x56, 0x69, 0x7e, 0x95, 0xad, 0xc7, 0xe3, 0xff}, {0x00, 0x02, 0x07, 0x0f, 0x18, 0x24, 0x30, 0x3f, /* 3 */ 0x4f, 0x61, 0x73, 0x88, 0x9d, 0xb4, 0xcd, 0xe6, 0xff}, {0x00, 0x04, 0x0b, 0x15, 0x20, 0x2d, 0x3b, 0x4a, /* 4 */ 0x5b, 0x6c, 0x7f, 0x92, 0xa7, 0xbc, 0xd2, 0xe9, 0xff}, {0x00, 0x07, 0x11, 0x15, 0x20, 0x2d, 0x48, 0x58, /* 5 */ 0x68, 0x79, 0x8b, 0x9d, 0xb0, 0xc4, 0xd7, 0xec, 0xff}, {0x00, 0x0c, 0x1a, 0x29, 0x38, 0x47, 0x57, 0x67, /* 6 */ 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff}, {0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, /* 7 */ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0, 0xff}, {0x00, 0x15, 0x27, 0x38, 0x49, 0x59, 0x69, 0x79, /* 8 */ 0x88, 0x97, 0xa7, 0xb6, 0xc4, 0xd3, 0xe2, 0xf0, 0xff}, {0x00, 0x1c, 0x30, 0x43, 0x54, 0x65, 0x75, 0x84, /* 9 */ 0x93, 0xa1, 0xb0, 0xbd, 0xca, 0xd8, 0xe5, 0xf2, 0xff}, {0x00, 0x24, 0x3b, 0x4f, 0x60, 0x70, 0x80, 0x8e, /* 10 */ 0x9c, 0xaa, 0xb7, 0xc4, 0xd0, 0xdc, 0xe8, 0xf3, 0xff}, {0x00, 0x2a, 0x3c, 0x5d, 0x6e, 0x7e, 0x8d, 0x9b, /* 11 */ 0xa8, 0xb4, 0xc0, 0xcb, 0xd6, 0xe1, 0xeb, 0xf5, 0xff}, {0x00, 0x3f, 0x5a, 0x6e, 0x7f, 0x8e, 0x9c, 0xa8, /* 12 */ 0xb4, 0xbf, 0xc9, 0xd3, 0xdc, 0xe5, 0xee, 0xf6, 0xff}, {0x00, 0x54, 0x6f, 0x83, 0x93, 0xa0, 0xad, 0xb7, /* 13 */ 0xc2, 0xcb, 0xd4, 0xdc, 0xe4, 0xeb, 0xf2, 0xf9, 0xff}, {0x00, 0x6e, 0x88, 0x9a, 0xa8, 0xb3, 0xbd, 0xc6, /* 14 */ 0xcf, 0xd6, 0xdd, 0xe3, 0xe9, 0xef, 0xf4, 0xfa, 0xff}, {0x00, 0x93, 0xa8, 0xb7, 0xc1, 0xca, 0xd2, 0xd8, /* 15 */ 0xde, 0xe3, 0xe8, 0xed, 0xf1, 0xf5, 0xf8, 0xfc, 0xff} }; static const u8 tas5130a_sensor_init[][8] = { {0x62, 0x08, 0x63, 0x70, 0x64, 0x1d, 0x60, 0x09}, {0x62, 0x20, 0x63, 0x01, 0x64, 0x02, 0x60, 0x09}, {0x62, 0x07, 0x63, 0x03, 0x64, 0x00, 0x60, 0x09}, }; static u8 sensor_reset[] = {0x61, 0x68, 0x62, 0xff, 0x60, 0x07}; /* read 1 byte */ static u8 reg_r(struct gspca_dev *gspca_dev, u16 index) { usb_control_msg(gspca_dev->dev, usb_rcvctrlpipe(gspca_dev->dev, 0), 0, /* request */ USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, /* value */ index, gspca_dev->usb_buf, 1, 500); return gspca_dev->usb_buf[0]; } static void reg_w(struct gspca_dev *gspca_dev, u16 index) { usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, index, NULL, 0, 500); } static void reg_w_buf(struct gspca_dev *gspca_dev, const u8 *buffer, u16 len) { if (len <= USB_BUF_SZ) { memcpy(gspca_dev->usb_buf, buffer, len); usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x01, 0, gspca_dev->usb_buf, len, 500); } else { u8 *tmpbuf; tmpbuf = kmemdup(buffer, len, GFP_KERNEL); if (!tmpbuf) { pr_err("Out of memory\n"); return; } usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x01, 0, tmpbuf, len, 500); kfree(tmpbuf); } } /* write values to consecutive registers */ static void reg_w_ixbuf(struct gspca_dev *gspca_dev, u8 reg, const u8 *buffer, u16 len) { int i; u8 *p, *tmpbuf; if (len * 2 <= USB_BUF_SZ) { p = tmpbuf = gspca_dev->usb_buf; } else { p = tmpbuf = kmalloc_array(len, 2, GFP_KERNEL); if (!tmpbuf) { pr_err("Out of memory\n"); return; } } i = len; while (--i >= 0) { *p++ = reg++; *p++ = *buffer++; } usb_control_msg(gspca_dev->dev, usb_sndctrlpipe(gspca_dev->dev, 0), 0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0x01, 0, tmpbuf, len * 2, 500); if (len * 2 > USB_BUF_SZ) kfree(tmpbuf); } static void om6802_sensor_init(struct gspca_dev *gspca_dev) { int i; const u8 *p; u8 byte; u8 val[6] = {0x62, 0, 0x64, 0, 0x60, 0x05}; static const u8 sensor_init[] = { 0xdf, 0x6d, 0xdd, 0x18, 0x5a, 0xe0, 0x5c, 0x07, 0x5d, 0xb0, 0x5e, 0x1e, 0x60, 0x71, 0xef, 0x00, 0xe9, 0x00, 0xea, 0x00, 0x90, 0x24, 0x91, 0xb2, 0x82, 0x32, 0xfd, 0x41, 0x00 /* table end */ }; reg_w_buf(gspca_dev, sensor_reset, sizeof sensor_reset); msleep(100); i = 4; while (--i > 0) { byte = reg_r(gspca_dev, 0x0060); if (!(byte & 0x01)) break; msleep(100); } byte = reg_r(gspca_dev, 0x0063); if (byte != 0x17) { pr_err("Bad sensor reset %02x\n", byte); /* continue? */ } p = sensor_init; while (*p != 0) { val[1] = *p++; val[3] = *p++; if (*p == 0) reg_w(gspca_dev, 0x3c80); reg_w_buf(gspca_dev, val, sizeof val); i = 4; while (--i >= 0) { msleep(15); byte = reg_r(gspca_dev, 0x60); if (!(byte & 0x01)) break; } } msleep(15); reg_w(gspca_dev, 0x3c80); } /* this function is called at probe time */ static int sd_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct cam *cam = &gspca_dev->cam; cam->cam_mode = vga_mode_t16; cam->nmodes = ARRAY_SIZE(vga_mode_t16); return 0; } static void setbrightness(struct gspca_dev *gspca_dev, s32 brightness) { u8 set6[4] = { 0x8f, 0x24, 0xc3, 0x00 }; if (brightness < 7) { set6[1] = 0x26; set6[3] = 0x70 - brightness * 0x10; } else { set6[3] = 0x00 + ((brightness - 7) * 0x10); } reg_w_buf(gspca_dev, set6, sizeof set6); } static void setcontrast(struct gspca_dev *gspca_dev, s32 contrast) { u16 reg_to_write; if (contrast < 7) reg_to_write = 0x8ea9 - contrast * 0x200; else reg_to_write = 0x00a9 + (contrast - 7) * 0x200; reg_w(gspca_dev, reg_to_write); } static void setcolors(struct gspca_dev *gspca_dev, s32 val) { u16 reg_to_write; reg_to_write = 0x80bb + val * 0x100; /* was 0xc0 */ reg_w(gspca_dev, reg_to_write); } static void setgamma(struct gspca_dev *gspca_dev, s32 val) { gspca_dbg(gspca_dev, D_CONF, "Gamma: %d\n", val); reg_w_ixbuf(gspca_dev, 0x90, gamma_table[val], sizeof gamma_table[0]); } static void setawb_n_RGB(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; u8 all_gain_reg[8] = { 0x87, 0x00, 0x88, 0x00, 0x89, 0x00, 0x80, 0x00 }; s32 red_gain, blue_gain, green_gain; green_gain = sd->gain->val; red_gain = green_gain + sd->red_balance->val; if (red_gain > 0x40) red_gain = 0x40; else if (red_gain < 0x10) red_gain = 0x10; blue_gain = green_gain + sd->blue_balance->val; if (blue_gain > 0x40) blue_gain = 0x40; else if (blue_gain < 0x10) blue_gain = 0x10; all_gain_reg[1] = red_gain; all_gain_reg[3] = blue_gain; all_gain_reg[5] = green_gain; all_gain_reg[7] = sensor_data[sd->sensor].reg80; if (!sd->awb->val) all_gain_reg[7] &= ~0x04; /* AWB off */ reg_w_buf(gspca_dev, all_gain_reg, sizeof all_gain_reg); } static void setsharpness(struct gspca_dev *gspca_dev, s32 val) { u16 reg_to_write; reg_to_write = 0x0aa6 + 0x1000 * val; reg_w(gspca_dev, reg_to_write); } static void setfreq(struct gspca_dev *gspca_dev, s32 val) { struct sd *sd = (struct sd *) gspca_dev; u8 reg66; u8 freq[4] = { 0x66, 0x00, 0xa8, 0xe8 }; switch (sd->sensor) { case SENSOR_LT168G: if (val != 0) freq[3] = 0xa8; reg66 = 0x41; break; case SENSOR_OM6802: reg66 = 0xca; break; default: reg66 = 0x40; break; } switch (val) { case 0: /* no flicker */ freq[3] = 0xf0; break; case 2: /* 60Hz */ reg66 &= ~0x40; break; } freq[1] = reg66; reg_w_buf(gspca_dev, freq, sizeof freq); } /* this function is called at probe and resume time */ static int sd_init(struct gspca_dev *gspca_dev) { /* some of this registers are not really needed, because * they are overridden by setbrigthness, setcontrast, etc., * but won't hurt anyway, and can help someone with similar webcam * to see the initial parameters.*/ struct sd *sd = (struct sd *) gspca_dev; const struct additional_sensor_data *sensor; int i; u16 sensor_id; u8 test_byte = 0; static const u8 read_indexs[] = { 0x0a, 0x0b, 0x66, 0x80, 0x81, 0x8e, 0x8f, 0xa5, 0xa6, 0xa8, 0xbb, 0xbc, 0xc6, 0x00 }; static const u8 n1[] = {0x08, 0x03, 0x09, 0x03, 0x12, 0x04}; static const u8 n2[] = {0x08, 0x00}; sensor_id = (reg_r(gspca_dev, 0x06) << 8) | reg_r(gspca_dev, 0x07); switch (sensor_id & 0xff0f) { case 0x0801: gspca_dbg(gspca_dev, D_PROBE, "sensor tas5130a\n"); sd->sensor = SENSOR_TAS5130A; break; case 0x0802: gspca_dbg(gspca_dev, D_PROBE, "sensor lt168g\n"); sd->sensor = SENSOR_LT168G; break; case 0x0803: gspca_dbg(gspca_dev, D_PROBE, "sensor 'other'\n"); sd->sensor = SENSOR_OTHER; break; case 0x0807: gspca_dbg(gspca_dev, D_PROBE, "sensor om6802\n"); sd->sensor = SENSOR_OM6802; break; default: pr_err("unknown sensor %04x\n", sensor_id); return -EINVAL; } if (sd->sensor == SENSOR_OM6802) { reg_w_buf(gspca_dev, n1, sizeof n1); i = 5; while (--i >= 0) { reg_w_buf(gspca_dev, sensor_reset, sizeof sensor_reset); test_byte = reg_r(gspca_dev, 0x0063); msleep(100); if (test_byte == 0x17) break; /* OK */ } if (i < 0) { pr_err("Bad sensor reset %02x\n", test_byte); return -EIO; } reg_w_buf(gspca_dev, n2, sizeof n2); } i = 0; while (read_indexs[i] != 0x00) { test_byte = reg_r(gspca_dev, read_indexs[i]); gspca_dbg(gspca_dev, D_STREAM, "Reg 0x%02x = 0x%02x\n", read_indexs[i], test_byte); i++; } sensor = &sensor_data[sd->sensor]; reg_w_buf(gspca_dev, sensor->n3, sizeof sensor->n3); reg_w_buf(gspca_dev, sensor->n4, sensor->n4sz); if (sd->sensor == SENSOR_LT168G) { test_byte = reg_r(gspca_dev, 0x80); gspca_dbg(gspca_dev, D_STREAM, "Reg 0x%02x = 0x%02x\n", 0x80, test_byte); reg_w(gspca_dev, 0x6c80); } reg_w_ixbuf(gspca_dev, 0xd0, sensor->data1, sizeof sensor->data1); reg_w_ixbuf(gspca_dev, 0xc7, sensor->data2, sizeof sensor->data2); reg_w_ixbuf(gspca_dev, 0xe0, sensor->data3, sizeof sensor->data3); reg_w(gspca_dev, (sensor->reg80 << 8) + 0x80); reg_w(gspca_dev, (sensor->reg80 << 8) + 0x80); reg_w(gspca_dev, (sensor->reg8e << 8) + 0x8e); reg_w(gspca_dev, (0x20 << 8) + 0x87); reg_w(gspca_dev, (0x20 << 8) + 0x88); reg_w(gspca_dev, (0x20 << 8) + 0x89); reg_w_buf(gspca_dev, sensor->data5, sizeof sensor->data5); reg_w_buf(gspca_dev, sensor->nset8, sizeof sensor->nset8); reg_w_buf(gspca_dev, sensor->stream, sizeof sensor->stream); if (sd->sensor == SENSOR_LT168G) { test_byte = reg_r(gspca_dev, 0x80); gspca_dbg(gspca_dev, D_STREAM, "Reg 0x%02x = 0x%02x\n", 0x80, test_byte); reg_w(gspca_dev, 0x6c80); } reg_w_ixbuf(gspca_dev, 0xd0, sensor->data1, sizeof sensor->data1); reg_w_ixbuf(gspca_dev, 0xc7, sensor->data2, sizeof sensor->data2); reg_w_ixbuf(gspca_dev, 0xe0, sensor->data3, sizeof sensor->data3); return 0; } static void setmirror(struct gspca_dev *gspca_dev, s32 val) { u8 hflipcmd[8] = {0x62, 0x07, 0x63, 0x03, 0x64, 0x00, 0x60, 0x09}; if (val) hflipcmd[3] = 0x01; reg_w_buf(gspca_dev, hflipcmd, sizeof hflipcmd); } static void seteffect(struct gspca_dev *gspca_dev, s32 val) { int idx = 0; switch (val) { case V4L2_COLORFX_NONE: break; case V4L2_COLORFX_BW: idx = 2; break; case V4L2_COLORFX_SEPIA: idx = 3; break; case V4L2_COLORFX_SKETCH: idx = 4; break; case V4L2_COLORFX_NEGATIVE: idx = 6; break; default: break; } reg_w_buf(gspca_dev, effects_table[idx], sizeof effects_table[0]); if (val == V4L2_COLORFX_SKETCH) reg_w(gspca_dev, 0x4aa6); else reg_w(gspca_dev, 0xfaa6); } /* Is this really needed? * i added some module parameters for test with some users */ static void poll_sensor(struct gspca_dev *gspca_dev) { static const u8 poll1[] = {0x67, 0x05, 0x68, 0x81, 0x69, 0x80, 0x6a, 0x82, 0x6b, 0x68, 0x6c, 0x69, 0x72, 0xd9, 0x73, 0x34, 0x74, 0x32, 0x75, 0x92, 0x76, 0x00, 0x09, 0x01, 0x60, 0x14}; static const u8 poll2[] = {0x67, 0x02, 0x68, 0x71, 0x69, 0x72, 0x72, 0xa9, 0x73, 0x02, 0x73, 0x02, 0x60, 0x14}; static const u8 noise03[] = /* (some differences / ms-drv) */ {0xa6, 0x0a, 0xea, 0xcf, 0xbe, 0x26, 0xb1, 0x5f, 0xa1, 0xb1, 0xda, 0x6b, 0xdb, 0x98, 0xdf, 0x0c, 0xc2, 0x80, 0xc3, 0x10}; gspca_dbg(gspca_dev, D_STREAM, "[Sensor requires polling]\n"); reg_w_buf(gspca_dev, poll1, sizeof poll1); reg_w_buf(gspca_dev, poll2, sizeof poll2); reg_w_buf(gspca_dev, noise03, sizeof noise03); } static int sd_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; const struct additional_sensor_data *sensor; int i, mode; u8 t2[] = { 0x07, 0x00, 0x0d, 0x60, 0x0e, 0x80 }; static const u8 t3[] = { 0x07, 0x00, 0x88, 0x02, 0x06, 0x00, 0xe7, 0x01 }; mode = gspca_dev->cam.cam_mode[gspca_dev->curr_mode].priv; switch (mode) { case 0: /* 640x480 (0x00) */ break; case 1: /* 352x288 */ t2[1] = 0x40; break; case 2: /* 320x240 */ t2[1] = 0x10; break; case 3: /* 176x144 */ t2[1] = 0x50; break; default: /* case 4: * 160x120 */ t2[1] = 0x20; break; } switch (sd->sensor) { case SENSOR_OM6802: om6802_sensor_init(gspca_dev); break; case SENSOR_TAS5130A: i = 0; for (;;) { reg_w_buf(gspca_dev, tas5130a_sensor_init[i], sizeof tas5130a_sensor_init[0]); if (i >= ARRAY_SIZE(tas5130a_sensor_init) - 1) break; i++; } reg_w(gspca_dev, 0x3c80); /* just in case and to keep sync with logs (for mine) */ reg_w_buf(gspca_dev, tas5130a_sensor_init[i], sizeof tas5130a_sensor_init[0]); reg_w(gspca_dev, 0x3c80); break; } sensor = &sensor_data[sd->sensor]; setfreq(gspca_dev, v4l2_ctrl_g_ctrl(sd->freq)); reg_r(gspca_dev, 0x0012); reg_w_buf(gspca_dev, t2, sizeof t2); reg_w_ixbuf(gspca_dev, 0xb3, t3, sizeof t3); reg_w(gspca_dev, 0x0013); msleep(15); reg_w_buf(gspca_dev, sensor->stream, sizeof sensor->stream); reg_w_buf(gspca_dev, sensor->stream, sizeof sensor->stream); if (sd->sensor == SENSOR_OM6802) poll_sensor(gspca_dev); return 0; } static void sd_stopN(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; reg_w_buf(gspca_dev, sensor_data[sd->sensor].stream, sizeof sensor_data[sd->sensor].stream); reg_w_buf(gspca_dev, sensor_data[sd->sensor].stream, sizeof sensor_data[sd->sensor].stream); if (sd->sensor == SENSOR_OM6802) { msleep(20); reg_w(gspca_dev, 0x0309); } #if IS_ENABLED(CONFIG_INPUT) /* If the last button state is pressed, release it now! */ if (sd->button_pressed) { input_report_key(gspca_dev->input_dev, KEY_CAMERA, 0); input_sync(gspca_dev->input_dev); sd->button_pressed = 0; } #endif } static void sd_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { struct sd *sd __maybe_unused = (struct sd *) gspca_dev; int pkt_type; if (data[0] == 0x5a) { #if IS_ENABLED(CONFIG_INPUT) if (len > 20) { u8 state = (data[20] & 0x80) ? 1 : 0; if (sd->button_pressed != state) { input_report_key(gspca_dev->input_dev, KEY_CAMERA, state); input_sync(gspca_dev->input_dev); sd->button_pressed = state; } } #endif /* Control Packet, after this came the header again, * but extra bytes came in the packet before this, * sometimes an EOF arrives, sometimes not... */ return; } data += 2; len -= 2; if (data[0] == 0xff && data[1] == 0xd8) pkt_type = FIRST_PACKET; else if (data[len - 2] == 0xff && data[len - 1] == 0xd9) pkt_type = LAST_PACKET; else pkt_type = INTER_PACKET; gspca_frame_add(gspca_dev, pkt_type, data, len); } static int sd_g_volatile_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); struct sd *sd = (struct sd *)gspca_dev; s32 red_gain, blue_gain, green_gain; gspca_dev->usb_err = 0; switch (ctrl->id) { case V4L2_CID_AUTO_WHITE_BALANCE: red_gain = reg_r(gspca_dev, 0x0087); if (red_gain > 0x40) red_gain = 0x40; else if (red_gain < 0x10) red_gain = 0x10; blue_gain = reg_r(gspca_dev, 0x0088); if (blue_gain > 0x40) blue_gain = 0x40; else if (blue_gain < 0x10) blue_gain = 0x10; green_gain = reg_r(gspca_dev, 0x0089); if (green_gain > 0x40) green_gain = 0x40; else if (green_gain < 0x10) green_gain = 0x10; sd->gain->val = green_gain; sd->red_balance->val = red_gain - green_gain; sd->blue_balance->val = blue_gain - green_gain; break; } return 0; } static int sd_s_ctrl(struct v4l2_ctrl *ctrl) { struct gspca_dev *gspca_dev = container_of(ctrl->handler, struct gspca_dev, ctrl_handler); gspca_dev->usb_err = 0; if (!gspca_dev->streaming) return 0; switch (ctrl->id) { case V4L2_CID_BRIGHTNESS: setbrightness(gspca_dev, ctrl->val); break; case V4L2_CID_CONTRAST: setcontrast(gspca_dev, ctrl->val); break; case V4L2_CID_SATURATION: setcolors(gspca_dev, ctrl->val); break; case V4L2_CID_GAMMA: setgamma(gspca_dev, ctrl->val); break; case V4L2_CID_HFLIP: setmirror(gspca_dev, ctrl->val); break; case V4L2_CID_SHARPNESS: setsharpness(gspca_dev, ctrl->val); break; case V4L2_CID_POWER_LINE_FREQUENCY: setfreq(gspca_dev, ctrl->val); break; case V4L2_CID_BACKLIGHT_COMPENSATION: reg_w(gspca_dev, ctrl->val ? 0xf48e : 0xb48e); break; case V4L2_CID_AUTO_WHITE_BALANCE: setawb_n_RGB(gspca_dev); break; case V4L2_CID_COLORFX: seteffect(gspca_dev, ctrl->val); break; } return gspca_dev->usb_err; } static const struct v4l2_ctrl_ops sd_ctrl_ops = { .g_volatile_ctrl = sd_g_volatile_ctrl, .s_ctrl = sd_s_ctrl, }; static int sd_init_controls(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *)gspca_dev; struct v4l2_ctrl_handler *hdl = &gspca_dev->ctrl_handler; gspca_dev->vdev.ctrl_handler = hdl; v4l2_ctrl_handler_init(hdl, 12); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BRIGHTNESS, 0, 14, 1, 8); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_CONTRAST, 0, 0x0d, 1, 7); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_SATURATION, 0, 0xf, 1, 5); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_GAMMA, 0, GAMMA_MAX, 1, 10); /* Activate lowlight, some apps don't bring up the backlight_compensation control) */ v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BACKLIGHT_COMPENSATION, 0, 1, 1, 1); if (sd->sensor == SENSOR_TAS5130A) v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_HFLIP, 0, 1, 1, 0); sd->awb = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_AUTO_WHITE_BALANCE, 0, 1, 1, 1); sd->gain = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_GAIN, 0x10, 0x40, 1, 0x20); sd->blue_balance = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_BLUE_BALANCE, -0x30, 0x30, 1, 0); sd->red_balance = v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_RED_BALANCE, -0x30, 0x30, 1, 0); v4l2_ctrl_new_std(hdl, &sd_ctrl_ops, V4L2_CID_SHARPNESS, 0, 15, 1, 6); v4l2_ctrl_new_std_menu(hdl, &sd_ctrl_ops, V4L2_CID_COLORFX, V4L2_COLORFX_SKETCH, ~((1 << V4L2_COLORFX_NONE) | (1 << V4L2_COLORFX_BW) | (1 << V4L2_COLORFX_SEPIA) | (1 << V4L2_COLORFX_SKETCH) | (1 << V4L2_COLORFX_NEGATIVE)), V4L2_COLORFX_NONE); sd->freq = v4l2_ctrl_new_std_menu(hdl, &sd_ctrl_ops, V4L2_CID_POWER_LINE_FREQUENCY, V4L2_CID_POWER_LINE_FREQUENCY_60HZ, 1, V4L2_CID_POWER_LINE_FREQUENCY_50HZ); if (hdl->error) { pr_err("Could not initialize controls\n"); return hdl->error; } v4l2_ctrl_auto_cluster(4, &sd->awb, 0, true); return 0; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = sd_config, .init = sd_init, .init_controls = sd_init_controls, .start = sd_start, .stopN = sd_stopN, .pkt_scan = sd_pkt_scan, #if IS_ENABLED(CONFIG_INPUT) .other_input = 1, #endif }; /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x17a1, 0x0128)}, {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = gspca_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver); |
| 57 80 3 56 7 55 14 4 4 3 2 3 2 3 3 7 7 4 7 3 3 3 3 78 78 3 79 79 78 2 80 80 79 79 79 4 78 93 90 93 7 86 93 4 285 283 1 2 30 30 24 6 34 28 1 2 21 8 5 15 7 334 20 3 17 20 20 34 34 34 34 34 8 91 90 1 4 81 3 7 12 21 21 10 11 8 5 13 2 12 11 4 33 33 33 1 23 27 1 5 5 4 1 3 9 2 1 1 1 1 1 1 1 8 1 1 2 1 9 16 1 1 12 1 1 26 14 14 14 4 3 1 2 7 1 1 6 4 2 3 4 11 2 5 13 1 2 2 1 3 2 1 3 3 2 1 3 3 2 7 1 22 1 8 1 9 4 14 26 28 1 3 6 3 4 1 2 1 1 2 4 4 1 9 1 2 1 1 2 3 2 1 1 197 2 2 194 2 194 334 153 334 201 379 380 381 381 334 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 | // SPDX-License-Identifier: GPL-2.0-only /* * Event char devices, giving access to raw input device events. * * Copyright (c) 1999-2002 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define EVDEV_MINOR_BASE 64 #define EVDEV_MINORS 32 #define EVDEV_MIN_BUFFER_SIZE 64U #define EVDEV_BUF_PACKETS 8 #include <linux/poll.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/init.h> #include <linux/input/mt.h> #include <linux/major.h> #include <linux/device.h> #include <linux/cdev.h> #include "input-compat.h" struct evdev { int open; struct input_handle handle; struct evdev_client __rcu *grab; struct list_head client_list; spinlock_t client_lock; /* protects client_list */ struct mutex mutex; struct device dev; struct cdev cdev; bool exist; }; struct evdev_client { unsigned int head; unsigned int tail; unsigned int packet_head; /* [future] position of the first element of next packet */ spinlock_t buffer_lock; /* protects access to buffer, head and tail */ wait_queue_head_t wait; struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; enum input_clock_type clk_type; bool revoked; unsigned long *evmasks[EV_CNT]; unsigned int bufsize; struct input_event buffer[] __counted_by(bufsize); }; static size_t evdev_get_mask_cnt(unsigned int type) { static const size_t counts[EV_CNT] = { /* EV_SYN==0 is EV_CNT, _not_ SYN_CNT, see EVIOCGBIT */ [EV_SYN] = EV_CNT, [EV_KEY] = KEY_CNT, [EV_REL] = REL_CNT, [EV_ABS] = ABS_CNT, [EV_MSC] = MSC_CNT, [EV_SW] = SW_CNT, [EV_LED] = LED_CNT, [EV_SND] = SND_CNT, [EV_FF] = FF_CNT, }; return (type < EV_CNT) ? counts[type] : 0; } /* requires the buffer lock to be held */ static bool __evdev_is_filtered(struct evdev_client *client, unsigned int type, unsigned int code) { unsigned long *mask; size_t cnt; /* EV_SYN and unknown codes are never filtered */ if (type == EV_SYN || type >= EV_CNT) return false; /* first test whether the type is filtered */ mask = client->evmasks[0]; if (mask && !test_bit(type, mask)) return true; /* unknown values are never filtered */ cnt = evdev_get_mask_cnt(type); if (!cnt || code >= cnt) return false; mask = client->evmasks[type]; return mask && !test_bit(code, mask); } /* flush queued events of type @type, caller must hold client->buffer_lock */ static void __evdev_flush_queue(struct evdev_client *client, unsigned int type) { unsigned int i, head, num; unsigned int mask = client->bufsize - 1; bool is_report; struct input_event *ev; BUG_ON(type == EV_SYN); head = client->tail; client->packet_head = client->tail; /* init to 1 so a leading SYN_REPORT will not be dropped */ num = 1; for (i = client->tail; i != client->head; i = (i + 1) & mask) { ev = &client->buffer[i]; is_report = ev->type == EV_SYN && ev->code == SYN_REPORT; if (ev->type == type) { /* drop matched entry */ continue; } else if (is_report && !num) { /* drop empty SYN_REPORT groups */ continue; } else if (head != i) { /* move entry to fill the gap */ client->buffer[head] = *ev; } num++; head = (head + 1) & mask; if (is_report) { num = 0; client->packet_head = head; } } client->head = head; } static void __evdev_queue_syn_dropped(struct evdev_client *client) { ktime_t *ev_time = input_get_timestamp(client->evdev->handle.dev); struct timespec64 ts = ktime_to_timespec64(ev_time[client->clk_type]); struct input_event ev; ev.input_event_sec = ts.tv_sec; ev.input_event_usec = ts.tv_nsec / NSEC_PER_USEC; ev.type = EV_SYN; ev.code = SYN_DROPPED; ev.value = 0; client->buffer[client->head++] = ev; client->head &= client->bufsize - 1; if (unlikely(client->head == client->tail)) { /* drop queue but keep our SYN_DROPPED event */ client->tail = (client->head - 1) & (client->bufsize - 1); client->packet_head = client->tail; } } static void evdev_queue_syn_dropped(struct evdev_client *client) { unsigned long flags; spin_lock_irqsave(&client->buffer_lock, flags); __evdev_queue_syn_dropped(client); spin_unlock_irqrestore(&client->buffer_lock, flags); } static int evdev_set_clk_type(struct evdev_client *client, unsigned int clkid) { unsigned long flags; enum input_clock_type clk_type; switch (clkid) { case CLOCK_REALTIME: clk_type = INPUT_CLK_REAL; break; case CLOCK_MONOTONIC: clk_type = INPUT_CLK_MONO; break; case CLOCK_BOOTTIME: clk_type = INPUT_CLK_BOOT; break; default: return -EINVAL; } if (client->clk_type != clk_type) { client->clk_type = clk_type; /* * Flush pending events and queue SYN_DROPPED event, * but only if the queue is not empty. */ spin_lock_irqsave(&client->buffer_lock, flags); if (client->head != client->tail) { client->packet_head = client->head = client->tail; __evdev_queue_syn_dropped(client); } spin_unlock_irqrestore(&client->buffer_lock, flags); } return 0; } static void __pass_event(struct evdev_client *client, const struct input_event *event) { client->buffer[client->head++] = *event; client->head &= client->bufsize - 1; if (unlikely(client->head == client->tail)) { /* * This effectively "drops" all unconsumed events, leaving * EV_SYN/SYN_DROPPED plus the newest event in the queue. */ client->tail = (client->head - 2) & (client->bufsize - 1); client->buffer[client->tail] = (struct input_event) { .input_event_sec = event->input_event_sec, .input_event_usec = event->input_event_usec, .type = EV_SYN, .code = SYN_DROPPED, .value = 0, }; client->packet_head = client->tail; } if (event->type == EV_SYN && event->code == SYN_REPORT) { client->packet_head = client->head; kill_fasync(&client->fasync, SIGIO, POLL_IN); } } static void evdev_pass_values(struct evdev_client *client, const struct input_value *vals, unsigned int count, ktime_t *ev_time) { const struct input_value *v; struct input_event event; struct timespec64 ts; bool wakeup = false; if (client->revoked) return; ts = ktime_to_timespec64(ev_time[client->clk_type]); event.input_event_sec = ts.tv_sec; event.input_event_usec = ts.tv_nsec / NSEC_PER_USEC; /* Interrupts are disabled, just acquire the lock. */ spin_lock(&client->buffer_lock); for (v = vals; v != vals + count; v++) { if (__evdev_is_filtered(client, v->type, v->code)) continue; if (v->type == EV_SYN && v->code == SYN_REPORT) { /* drop empty SYN_REPORT */ if (client->packet_head == client->head) continue; wakeup = true; } event.type = v->type; event.code = v->code; event.value = v->value; __pass_event(client, &event); } spin_unlock(&client->buffer_lock); if (wakeup) wake_up_interruptible_poll(&client->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM); } /* * Pass incoming events to all connected clients. */ static unsigned int evdev_events(struct input_handle *handle, struct input_value *vals, unsigned int count) { struct evdev *evdev = handle->private; struct evdev_client *client; ktime_t *ev_time = input_get_timestamp(handle->dev); rcu_read_lock(); client = rcu_dereference(evdev->grab); if (client) evdev_pass_values(client, vals, count, ev_time); else list_for_each_entry_rcu(client, &evdev->client_list, node) evdev_pass_values(client, vals, count, ev_time); rcu_read_unlock(); return count; } static int evdev_fasync(int fd, struct file *file, int on) { struct evdev_client *client = file->private_data; return fasync_helper(fd, file, on, &client->fasync); } static void evdev_free(struct device *dev) { struct evdev *evdev = container_of(dev, struct evdev, dev); input_put_device(evdev->handle.dev); kfree(evdev); } /* * Grabs an event device (along with underlying input device). * This function is called with evdev->mutex taken. */ static int evdev_grab(struct evdev *evdev, struct evdev_client *client) { int error; if (evdev->grab) return -EBUSY; error = input_grab_device(&evdev->handle); if (error) return error; rcu_assign_pointer(evdev->grab, client); return 0; } static int evdev_ungrab(struct evdev *evdev, struct evdev_client *client) { struct evdev_client *grab = rcu_dereference_protected(evdev->grab, lockdep_is_held(&evdev->mutex)); if (grab != client) return -EINVAL; rcu_assign_pointer(evdev->grab, NULL); synchronize_rcu(); input_release_device(&evdev->handle); return 0; } static void evdev_attach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_add_tail_rcu(&client->node, &evdev->client_list); spin_unlock(&evdev->client_lock); } static void evdev_detach_client(struct evdev *evdev, struct evdev_client *client) { spin_lock(&evdev->client_lock); list_del_rcu(&client->node); spin_unlock(&evdev->client_lock); synchronize_rcu(); } static int evdev_open_device(struct evdev *evdev) { int retval; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist) retval = -ENODEV; else if (!evdev->open++) { retval = input_open_device(&evdev->handle); if (retval) evdev->open--; } mutex_unlock(&evdev->mutex); return retval; } static void evdev_close_device(struct evdev *evdev) { mutex_lock(&evdev->mutex); if (evdev->exist && !--evdev->open) input_close_device(&evdev->handle); mutex_unlock(&evdev->mutex); } /* * Wake up users waiting for IO so they can disconnect from * dead device. */ static void evdev_hangup(struct evdev *evdev) { struct evdev_client *client; spin_lock(&evdev->client_lock); list_for_each_entry(client, &evdev->client_list, node) { kill_fasync(&client->fasync, SIGIO, POLL_HUP); wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); } spin_unlock(&evdev->client_lock); } static int evdev_release(struct inode *inode, struct file *file) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; unsigned int i; mutex_lock(&evdev->mutex); if (evdev->exist && !client->revoked) input_flush_device(&evdev->handle, file); evdev_ungrab(evdev, client); mutex_unlock(&evdev->mutex); evdev_detach_client(evdev, client); for (i = 0; i < EV_CNT; ++i) bitmap_free(client->evmasks[i]); kvfree(client); evdev_close_device(evdev); return 0; } static unsigned int evdev_compute_buffer_size(struct input_dev *dev) { unsigned int n_events = max(dev->hint_events_per_packet * EVDEV_BUF_PACKETS, EVDEV_MIN_BUFFER_SIZE); return roundup_pow_of_two(n_events); } static int evdev_open(struct inode *inode, struct file *file) { struct evdev *evdev = container_of(inode->i_cdev, struct evdev, cdev); unsigned int bufsize = evdev_compute_buffer_size(evdev->handle.dev); struct evdev_client *client; int error; client = kvzalloc(struct_size(client, buffer, bufsize), GFP_KERNEL); if (!client) return -ENOMEM; init_waitqueue_head(&client->wait); client->bufsize = bufsize; spin_lock_init(&client->buffer_lock); client->evdev = evdev; evdev_attach_client(evdev, client); error = evdev_open_device(evdev); if (error) goto err_free_client; file->private_data = client; stream_open(inode, file); return 0; err_free_client: evdev_detach_client(evdev, client); kvfree(client); return error; } static ssize_t evdev_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_event event; int retval = 0; /* * Limit amount of data we inject into the input subsystem so that * we do not hold evdev->mutex for too long. 4096 bytes corresponds * to 170 input events. */ count = min(count, 4096); if (count != 0 && count < input_event_size()) return -EINVAL; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist || client->revoked) { retval = -ENODEV; goto out; } while (retval + input_event_size() <= count) { if (input_event_from_user(buffer + retval, &event)) { retval = -EFAULT; goto out; } retval += input_event_size(); input_inject_event(&evdev->handle, event.type, event.code, event.value); cond_resched(); } out: mutex_unlock(&evdev->mutex); return retval; } static int evdev_fetch_next_event(struct evdev_client *client, struct input_event *event) { int have_event; spin_lock_irq(&client->buffer_lock); have_event = client->packet_head != client->tail; if (have_event) { *event = client->buffer[client->tail++]; client->tail &= client->bufsize - 1; } spin_unlock_irq(&client->buffer_lock); return have_event; } static ssize_t evdev_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_event event; size_t read = 0; int error; if (count != 0 && count < input_event_size()) return -EINVAL; for (;;) { if (!evdev->exist || client->revoked) return -ENODEV; if (client->packet_head == client->tail && (file->f_flags & O_NONBLOCK)) return -EAGAIN; /* * count == 0 is special - no IO is done but we check * for error conditions (see above). */ if (count == 0) break; while (read + input_event_size() <= count && evdev_fetch_next_event(client, &event)) { if (input_event_to_user(buffer + read, &event)) return -EFAULT; read += input_event_size(); } if (read) break; if (!(file->f_flags & O_NONBLOCK)) { error = wait_event_interruptible(client->wait, client->packet_head != client->tail || !evdev->exist || client->revoked); if (error) return error; } } return read; } /* No kernel lock - fine */ static __poll_t evdev_poll(struct file *file, poll_table *wait) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; __poll_t mask; poll_wait(file, &client->wait, wait); if (evdev->exist && !client->revoked) mask = EPOLLOUT | EPOLLWRNORM; else mask = EPOLLHUP | EPOLLERR; if (client->packet_head != client->tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; } #ifdef CONFIG_COMPAT #define BITS_PER_LONG_COMPAT (sizeof(compat_long_t) * 8) #define BITS_TO_LONGS_COMPAT(x) ((((x) - 1) / BITS_PER_LONG_COMPAT) + 1) #ifdef __BIG_ENDIAN static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len, i; if (compat) { len = BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t); if (len > maxlen) len = maxlen; for (i = 0; i < len / sizeof(compat_long_t); i++) if (copy_to_user((compat_long_t __user *) p + i, (compat_long_t *) bits + i + 1 - ((i % 2) << 1), sizeof(compat_long_t))) return -EFAULT; } else { len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; if (copy_to_user(p, bits, len)) return -EFAULT; } return len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { int len, i; if (compat) { if (maxlen % sizeof(compat_long_t)) return -EINVAL; len = BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t); if (len > maxlen) len = maxlen; for (i = 0; i < len / sizeof(compat_long_t); i++) if (copy_from_user((compat_long_t *) bits + i + 1 - ((i % 2) << 1), (compat_long_t __user *) p + i, sizeof(compat_long_t))) return -EFAULT; if (i % 2) *((compat_long_t *) bits + i - 1) = 0; } else { if (maxlen % sizeof(long)) return -EINVAL; len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; if (copy_from_user(bits, p, len)) return -EFAULT; } return len; } #else static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len = compat ? BITS_TO_LONGS_COMPAT(maxbit) * sizeof(compat_long_t) : BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_to_user(p, bits, len) ? -EFAULT : len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { size_t chunk_size = compat ? sizeof(compat_long_t) : sizeof(long); int len; if (maxlen % chunk_size) return -EINVAL; len = compat ? BITS_TO_LONGS_COMPAT(maxbit) : BITS_TO_LONGS(maxbit); len *= chunk_size; if (len > maxlen) len = maxlen; return copy_from_user(bits, p, len) ? -EFAULT : len; } #endif /* __BIG_ENDIAN */ #else static int bits_to_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_to_user(p, bits, len) ? -EFAULT : len; } static int bits_from_user(unsigned long *bits, unsigned int maxbit, unsigned int maxlen, const void __user *p, int compat) { int len; if (maxlen % sizeof(long)) return -EINVAL; len = BITS_TO_LONGS(maxbit) * sizeof(long); if (len > maxlen) len = maxlen; return copy_from_user(bits, p, len) ? -EFAULT : len; } #endif /* CONFIG_COMPAT */ static int str_to_user(const char *str, unsigned int maxlen, void __user *p) { int len; if (!str) return -ENOENT; len = strlen(str) + 1; if (len > maxlen) len = maxlen; return copy_to_user(p, str, len) ? -EFAULT : len; } static int handle_eviocgbit(struct input_dev *dev, unsigned int type, unsigned int size, void __user *p, int compat_mode) { unsigned long *bits; int len; switch (type) { case 0: bits = dev->evbit; len = EV_MAX; break; case EV_KEY: bits = dev->keybit; len = KEY_MAX; break; case EV_REL: bits = dev->relbit; len = REL_MAX; break; case EV_ABS: bits = dev->absbit; len = ABS_MAX; break; case EV_MSC: bits = dev->mscbit; len = MSC_MAX; break; case EV_LED: bits = dev->ledbit; len = LED_MAX; break; case EV_SND: bits = dev->sndbit; len = SND_MAX; break; case EV_FF: bits = dev->ffbit; len = FF_MAX; break; case EV_SW: bits = dev->swbit; len = SW_MAX; break; default: return -EINVAL; } return bits_to_user(bits, len, size, p, compat_mode); } static int evdev_handle_get_keycode(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke = { .len = sizeof(unsigned int), .flags = 0, }; int __user *ip = (int __user *)p; int error; /* legacy case */ if (copy_from_user(ke.scancode, p, sizeof(unsigned int))) return -EFAULT; error = input_get_keycode(dev, &ke); if (error) return error; if (put_user(ke.keycode, ip + 1)) return -EFAULT; return 0; } static int evdev_handle_get_keycode_v2(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke; int error; if (copy_from_user(&ke, p, sizeof(ke))) return -EFAULT; error = input_get_keycode(dev, &ke); if (error) return error; if (copy_to_user(p, &ke, sizeof(ke))) return -EFAULT; return 0; } static int evdev_handle_set_keycode(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke = { .len = sizeof(unsigned int), .flags = 0, }; int __user *ip = (int __user *)p; if (copy_from_user(ke.scancode, p, sizeof(unsigned int))) return -EFAULT; if (get_user(ke.keycode, ip + 1)) return -EFAULT; return input_set_keycode(dev, &ke); } static int evdev_handle_set_keycode_v2(struct input_dev *dev, void __user *p) { struct input_keymap_entry ke; if (copy_from_user(&ke, p, sizeof(ke))) return -EFAULT; if (ke.len > sizeof(ke.scancode)) return -EINVAL; return input_set_keycode(dev, &ke); } /* * If we transfer state to the user, we should flush all pending events * of the same type from the client's queue. Otherwise, they might end up * with duplicate events, which can screw up client's state tracking. * If bits_to_user fails after flushing the queue, we queue a SYN_DROPPED * event so user-space will notice missing events. * * LOCKING: * We need to take event_lock before buffer_lock to avoid dead-locks. But we * need the even_lock only to guarantee consistent state. We can safely release * it while flushing the queue. This allows input-core to handle filters while * we flush the queue. */ static int evdev_handle_get_val(struct evdev_client *client, struct input_dev *dev, unsigned int type, unsigned long *bits, unsigned int maxbit, unsigned int maxlen, void __user *p, int compat) { int ret; unsigned long *mem; mem = bitmap_alloc(maxbit, GFP_KERNEL); if (!mem) return -ENOMEM; spin_lock_irq(&dev->event_lock); spin_lock(&client->buffer_lock); bitmap_copy(mem, bits, maxbit); spin_unlock(&dev->event_lock); __evdev_flush_queue(client, type); spin_unlock_irq(&client->buffer_lock); ret = bits_to_user(mem, maxbit, maxlen, p, compat); if (ret < 0) evdev_queue_syn_dropped(client); bitmap_free(mem); return ret; } static int evdev_handle_mt_request(struct input_dev *dev, unsigned int size, int __user *ip) { const struct input_mt *mt = dev->mt; unsigned int code; int max_slots; int i; if (get_user(code, &ip[0])) return -EFAULT; if (!mt || !input_is_mt_value(code)) return -EINVAL; max_slots = (size - sizeof(__u32)) / sizeof(__s32); for (i = 0; i < mt->num_slots && i < max_slots; i++) { int value = input_mt_get_value(&mt->slots[i], code); if (put_user(value, &ip[1 + i])) return -EFAULT; } return 0; } static int evdev_revoke(struct evdev *evdev, struct evdev_client *client, struct file *file) { client->revoked = true; evdev_ungrab(evdev, client); input_flush_device(&evdev->handle, file); wake_up_interruptible_poll(&client->wait, EPOLLHUP | EPOLLERR); return 0; } /* must be called with evdev-mutex held */ static int evdev_set_mask(struct evdev_client *client, unsigned int type, const void __user *codes, u32 codes_size, int compat) { unsigned long flags, *mask, *oldmask; size_t cnt; int error; /* we allow unknown types and 'codes_size > size' for forward-compat */ cnt = evdev_get_mask_cnt(type); if (!cnt) return 0; mask = bitmap_zalloc(cnt, GFP_KERNEL); if (!mask) return -ENOMEM; error = bits_from_user(mask, cnt - 1, codes_size, codes, compat); if (error < 0) { bitmap_free(mask); return error; } spin_lock_irqsave(&client->buffer_lock, flags); oldmask = client->evmasks[type]; client->evmasks[type] = mask; spin_unlock_irqrestore(&client->buffer_lock, flags); bitmap_free(oldmask); return 0; } /* must be called with evdev-mutex held */ static int evdev_get_mask(struct evdev_client *client, unsigned int type, void __user *codes, u32 codes_size, int compat) { unsigned long *mask; size_t cnt, size, xfer_size; int i; int error; /* we allow unknown types and 'codes_size > size' for forward-compat */ cnt = evdev_get_mask_cnt(type); size = sizeof(unsigned long) * BITS_TO_LONGS(cnt); xfer_size = min_t(size_t, codes_size, size); if (cnt > 0) { mask = client->evmasks[type]; if (mask) { error = bits_to_user(mask, cnt - 1, xfer_size, codes, compat); if (error < 0) return error; } else { /* fake mask with all bits set */ for (i = 0; i < xfer_size; i++) if (put_user(0xffU, (u8 __user *)codes + i)) return -EFAULT; } } if (xfer_size < codes_size) if (clear_user(codes + xfer_size, codes_size - xfer_size)) return -EFAULT; return 0; } static long evdev_do_ioctl(struct file *file, unsigned int cmd, void __user *p, int compat_mode) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; struct input_dev *dev = evdev->handle.dev; struct input_absinfo abs; struct input_mask mask; struct ff_effect effect; int __user *ip = (int __user *)p; unsigned int i, t, u, v; unsigned int size; int error; /* First we check for fixed-length commands */ switch (cmd) { case EVIOCGVERSION: return put_user(EV_VERSION, ip); case EVIOCGID: if (copy_to_user(p, &dev->id, sizeof(struct input_id))) return -EFAULT; return 0; case EVIOCGREP: if (!test_bit(EV_REP, dev->evbit)) return -ENOSYS; if (put_user(dev->rep[REP_DELAY], ip)) return -EFAULT; if (put_user(dev->rep[REP_PERIOD], ip + 1)) return -EFAULT; return 0; case EVIOCSREP: if (!test_bit(EV_REP, dev->evbit)) return -ENOSYS; if (get_user(u, ip)) return -EFAULT; if (get_user(v, ip + 1)) return -EFAULT; input_inject_event(&evdev->handle, EV_REP, REP_DELAY, u); input_inject_event(&evdev->handle, EV_REP, REP_PERIOD, v); return 0; case EVIOCRMFF: return input_ff_erase(dev, (int)(unsigned long) p, file); case EVIOCGEFFECTS: i = test_bit(EV_FF, dev->evbit) ? dev->ff->max_effects : 0; if (put_user(i, ip)) return -EFAULT; return 0; case EVIOCGRAB: if (p) return evdev_grab(evdev, client); else return evdev_ungrab(evdev, client); case EVIOCREVOKE: if (p) return -EINVAL; else return evdev_revoke(evdev, client, file); case EVIOCGMASK: { void __user *codes_ptr; if (copy_from_user(&mask, p, sizeof(mask))) return -EFAULT; codes_ptr = (void __user *)(unsigned long)mask.codes_ptr; return evdev_get_mask(client, mask.type, codes_ptr, mask.codes_size, compat_mode); } case EVIOCSMASK: { const void __user *codes_ptr; if (copy_from_user(&mask, p, sizeof(mask))) return -EFAULT; codes_ptr = (const void __user *)(unsigned long)mask.codes_ptr; return evdev_set_mask(client, mask.type, codes_ptr, mask.codes_size, compat_mode); } case EVIOCSCLOCKID: if (copy_from_user(&i, p, sizeof(unsigned int))) return -EFAULT; return evdev_set_clk_type(client, i); case EVIOCGKEYCODE: return evdev_handle_get_keycode(dev, p); case EVIOCSKEYCODE: return evdev_handle_set_keycode(dev, p); case EVIOCGKEYCODE_V2: return evdev_handle_get_keycode_v2(dev, p); case EVIOCSKEYCODE_V2: return evdev_handle_set_keycode_v2(dev, p); } size = _IOC_SIZE(cmd); /* Now check variable-length commands */ #define EVIOC_MASK_SIZE(nr) ((nr) & ~(_IOC_SIZEMASK << _IOC_SIZESHIFT)) switch (EVIOC_MASK_SIZE(cmd)) { case EVIOCGPROP(0): return bits_to_user(dev->propbit, INPUT_PROP_MAX, size, p, compat_mode); case EVIOCGMTSLOTS(0): return evdev_handle_mt_request(dev, size, ip); case EVIOCGKEY(0): return evdev_handle_get_val(client, dev, EV_KEY, dev->key, KEY_MAX, size, p, compat_mode); case EVIOCGLED(0): return evdev_handle_get_val(client, dev, EV_LED, dev->led, LED_MAX, size, p, compat_mode); case EVIOCGSND(0): return evdev_handle_get_val(client, dev, EV_SND, dev->snd, SND_MAX, size, p, compat_mode); case EVIOCGSW(0): return evdev_handle_get_val(client, dev, EV_SW, dev->sw, SW_MAX, size, p, compat_mode); case EVIOCGNAME(0): return str_to_user(dev->name, size, p); case EVIOCGPHYS(0): return str_to_user(dev->phys, size, p); case EVIOCGUNIQ(0): return str_to_user(dev->uniq, size, p); case EVIOC_MASK_SIZE(EVIOCSFF): if (input_ff_effect_from_user(p, size, &effect)) return -EFAULT; error = input_ff_upload(dev, &effect, file); if (error) return error; if (put_user(effect.id, &(((struct ff_effect __user *)p)->id))) return -EFAULT; return 0; } /* Multi-number variable-length handlers */ if (_IOC_TYPE(cmd) != 'E') return -EINVAL; if (_IOC_DIR(cmd) == _IOC_READ) { if ((_IOC_NR(cmd) & ~EV_MAX) == _IOC_NR(EVIOCGBIT(0, 0))) return handle_eviocgbit(dev, _IOC_NR(cmd) & EV_MAX, size, p, compat_mode); if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCGABS(0))) { if (!dev->absinfo) return -EINVAL; t = _IOC_NR(cmd) & ABS_MAX; abs = dev->absinfo[t]; if (copy_to_user(p, &abs, min_t(size_t, size, sizeof(struct input_absinfo)))) return -EFAULT; return 0; } } if (_IOC_DIR(cmd) == _IOC_WRITE) { if ((_IOC_NR(cmd) & ~ABS_MAX) == _IOC_NR(EVIOCSABS(0))) { if (!dev->absinfo) return -EINVAL; t = _IOC_NR(cmd) & ABS_MAX; if (copy_from_user(&abs, p, min_t(size_t, size, sizeof(struct input_absinfo)))) return -EFAULT; if (size < sizeof(struct input_absinfo)) abs.resolution = 0; /* We can't change number of reserved MT slots */ if (t == ABS_MT_SLOT) return -EINVAL; /* * Take event lock to ensure that we are not * changing device parameters in the middle * of event. */ spin_lock_irq(&dev->event_lock); dev->absinfo[t] = abs; spin_unlock_irq(&dev->event_lock); return 0; } } return -EINVAL; } static long evdev_ioctl_handler(struct file *file, unsigned int cmd, void __user *p, int compat_mode) { struct evdev_client *client = file->private_data; struct evdev *evdev = client->evdev; int retval; retval = mutex_lock_interruptible(&evdev->mutex); if (retval) return retval; if (!evdev->exist || client->revoked) { retval = -ENODEV; goto out; } retval = evdev_do_ioctl(file, cmd, p, compat_mode); out: mutex_unlock(&evdev->mutex); return retval; } static long evdev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return evdev_ioctl_handler(file, cmd, (void __user *)arg, 0); } #ifdef CONFIG_COMPAT static long evdev_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return evdev_ioctl_handler(file, cmd, compat_ptr(arg), 1); } #endif static const struct file_operations evdev_fops = { .owner = THIS_MODULE, .read = evdev_read, .write = evdev_write, .poll = evdev_poll, .open = evdev_open, .release = evdev_release, .unlocked_ioctl = evdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = evdev_ioctl_compat, #endif .fasync = evdev_fasync, }; /* * Mark device non-existent. This disables writes, ioctls and * prevents new users from opening the device. Already posted * blocking reads will stay, however new ones will fail. */ static void evdev_mark_dead(struct evdev *evdev) { mutex_lock(&evdev->mutex); evdev->exist = false; mutex_unlock(&evdev->mutex); } static void evdev_cleanup(struct evdev *evdev) { struct input_handle *handle = &evdev->handle; evdev_mark_dead(evdev); evdev_hangup(evdev); /* evdev is marked dead so no one else accesses evdev->open */ if (evdev->open) { input_flush_device(handle, NULL); input_close_device(handle); } } /* * Create new evdev device. Note that input core serializes calls * to connect and disconnect. */ static int evdev_connect(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id) { struct evdev *evdev; int minor; int dev_no; int error; minor = input_get_new_minor(EVDEV_MINOR_BASE, EVDEV_MINORS, true); if (minor < 0) { error = minor; pr_err("failed to reserve new minor: %d\n", error); return error; } evdev = kzalloc(sizeof(struct evdev), GFP_KERNEL); if (!evdev) { error = -ENOMEM; goto err_free_minor; } INIT_LIST_HEAD(&evdev->client_list); spin_lock_init(&evdev->client_lock); mutex_init(&evdev->mutex); evdev->exist = true; dev_no = minor; /* Normalize device number if it falls into legacy range */ if (dev_no < EVDEV_MINOR_BASE + EVDEV_MINORS) dev_no -= EVDEV_MINOR_BASE; dev_set_name(&evdev->dev, "event%d", dev_no); evdev->handle.dev = input_get_device(dev); evdev->handle.name = dev_name(&evdev->dev); evdev->handle.handler = handler; evdev->handle.private = evdev; evdev->dev.devt = MKDEV(INPUT_MAJOR, minor); evdev->dev.class = &input_class; evdev->dev.parent = &dev->dev; evdev->dev.release = evdev_free; device_initialize(&evdev->dev); error = input_register_handle(&evdev->handle); if (error) goto err_free_evdev; cdev_init(&evdev->cdev, &evdev_fops); error = cdev_device_add(&evdev->cdev, &evdev->dev); if (error) goto err_cleanup_evdev; return 0; err_cleanup_evdev: evdev_cleanup(evdev); input_unregister_handle(&evdev->handle); err_free_evdev: put_device(&evdev->dev); err_free_minor: input_free_minor(minor); return error; } static void evdev_disconnect(struct input_handle *handle) { struct evdev *evdev = handle->private; cdev_device_del(&evdev->cdev, &evdev->dev); evdev_cleanup(evdev); input_free_minor(MINOR(evdev->dev.devt)); input_unregister_handle(handle); put_device(&evdev->dev); } static const struct input_device_id evdev_ids[] = { { /* Matches all devices */ .flags = INPUT_DEVICE_ID_MATCH_EVBIT, .evbit = { BIT_MASK(EV_SYN) }, }, { } /* Terminating zero entry */ }; MODULE_DEVICE_TABLE(input, evdev_ids); static struct input_handler evdev_handler = { .events = evdev_events, .connect = evdev_connect, .disconnect = evdev_disconnect, .legacy_minors = true, .minor = EVDEV_MINOR_BASE, .name = "evdev", .id_table = evdev_ids, }; static int __init evdev_init(void) { return input_register_handler(&evdev_handler); } static void __exit evdev_exit(void) { input_unregister_handler(&evdev_handler); } module_init(evdev_init); module_exit(evdev_exit); MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Input driver event char devices"); MODULE_LICENSE("GPL"); |
| 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 | // SPDX-License-Identifier: GPL-2.0-only /* * Input device TTY line discipline * * Copyright (c) 1999-2002 Vojtech Pavlik * * This is a module that converts a tty line into a much simpler * 'serial io port' abstraction that the input device drivers use. */ #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/init.h> #include <linux/serio.h> #include <linux/tty.h> #include <linux/compat.h> MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>"); MODULE_DESCRIPTION("Input device TTY line discipline"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_MOUSE); #define SERPORT_BUSY 1 #define SERPORT_ACTIVE 2 #define SERPORT_DEAD 3 struct serport { struct tty_struct *tty; wait_queue_head_t wait; struct serio *serio; struct serio_device_id id; spinlock_t lock; unsigned long flags; }; /* * Callback functions from the serio code. */ static int serport_serio_write(struct serio *serio, unsigned char data) { struct serport *serport = serio->port_data; return -(serport->tty->ops->write(serport->tty, &data, 1) != 1); } static int serport_serio_open(struct serio *serio) { struct serport *serport = serio->port_data; guard(spinlock_irqsave)(&serport->lock); set_bit(SERPORT_ACTIVE, &serport->flags); return 0; } static void serport_serio_close(struct serio *serio) { struct serport *serport = serio->port_data; guard(spinlock_irqsave)(&serport->lock); clear_bit(SERPORT_ACTIVE, &serport->flags); } /* * serport_ldisc_open() is the routine that is called upon setting our line * discipline on a tty. It prepares the serio struct. */ static int serport_ldisc_open(struct tty_struct *tty) { struct serport *serport; if (!capable(CAP_SYS_ADMIN)) return -EPERM; serport = kzalloc(sizeof(*serport), GFP_KERNEL); if (!serport) return -ENOMEM; serport->tty = tty; spin_lock_init(&serport->lock); init_waitqueue_head(&serport->wait); tty->disc_data = serport; tty->receive_room = 256; set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); return 0; } /* * serport_ldisc_close() is the opposite of serport_ldisc_open() */ static void serport_ldisc_close(struct tty_struct *tty) { struct serport *serport = tty->disc_data; kfree(serport); } /* * serport_ldisc_receive() is called by the low level tty driver when characters * are ready for us. We forward the characters and flags, one by one to the * 'interrupt' routine. */ static void serport_ldisc_receive(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count) { struct serport *serport = tty->disc_data; unsigned int ch_flags = 0; int i; guard(spinlock_irqsave)(&serport->lock); if (!test_bit(SERPORT_ACTIVE, &serport->flags)) return; for (i = 0; i < count; i++) { if (fp) { switch (fp[i]) { case TTY_FRAME: ch_flags = SERIO_FRAME; break; case TTY_PARITY: ch_flags = SERIO_PARITY; break; default: ch_flags = 0; break; } } serio_interrupt(serport->serio, cp[i], ch_flags); } } /* * serport_ldisc_read() just waits indefinitely if everything goes well. * However, when the serio driver closes the serio port, it finishes, * returning 0 characters. */ static ssize_t serport_ldisc_read(struct tty_struct * tty, struct file * file, u8 *kbuf, size_t nr, void **cookie, unsigned long offset) { struct serport *serport = tty->disc_data; struct serio *serio; if (test_and_set_bit(SERPORT_BUSY, &serport->flags)) return -EBUSY; serport->serio = serio = kzalloc(sizeof(*serio), GFP_KERNEL); if (!serio) return -ENOMEM; strscpy(serio->name, "Serial port", sizeof(serio->name)); snprintf(serio->phys, sizeof(serio->phys), "%s/serio0", tty_name(tty)); serio->id = serport->id; serio->id.type = SERIO_RS232; serio->write = serport_serio_write; serio->open = serport_serio_open; serio->close = serport_serio_close; serio->port_data = serport; serio->dev.parent = tty->dev; serio_register_port(serport->serio); printk(KERN_INFO "serio: Serial port %s\n", tty_name(tty)); wait_event_interruptible(serport->wait, test_bit(SERPORT_DEAD, &serport->flags)); serio_unregister_port(serport->serio); serport->serio = NULL; clear_bit(SERPORT_DEAD, &serport->flags); clear_bit(SERPORT_BUSY, &serport->flags); return 0; } static void serport_set_type(struct tty_struct *tty, unsigned long type) { struct serport *serport = tty->disc_data; serport->id.proto = type & 0x000000ff; serport->id.id = (type & 0x0000ff00) >> 8; serport->id.extra = (type & 0x00ff0000) >> 16; } /* * serport_ldisc_ioctl() allows to set the port protocol, and device ID */ static int serport_ldisc_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { if (cmd == SPIOCSTYPE) { unsigned long type; if (get_user(type, (unsigned long __user *) arg)) return -EFAULT; serport_set_type(tty, type); return 0; } return -EINVAL; } #ifdef CONFIG_COMPAT #define COMPAT_SPIOCSTYPE _IOW('q', 0x01, compat_ulong_t) static int serport_ldisc_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { if (cmd == COMPAT_SPIOCSTYPE) { void __user *uarg = compat_ptr(arg); compat_ulong_t compat_type; if (get_user(compat_type, (compat_ulong_t __user *)uarg)) return -EFAULT; serport_set_type(tty, compat_type); return 0; } return -EINVAL; } #endif static void serport_ldisc_hangup(struct tty_struct *tty) { struct serport *serport = tty->disc_data; scoped_guard(spinlock_irqsave, &serport->lock) set_bit(SERPORT_DEAD, &serport->flags); wake_up_interruptible(&serport->wait); } static void serport_ldisc_write_wakeup(struct tty_struct * tty) { struct serport *serport = tty->disc_data; guard(spinlock_irqsave)(&serport->lock); if (test_bit(SERPORT_ACTIVE, &serport->flags)) serio_drv_write_wakeup(serport->serio); } /* * The line discipline structure. */ static struct tty_ldisc_ops serport_ldisc = { .owner = THIS_MODULE, .num = N_MOUSE, .name = "input", .open = serport_ldisc_open, .close = serport_ldisc_close, .read = serport_ldisc_read, .ioctl = serport_ldisc_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = serport_ldisc_compat_ioctl, #endif .receive_buf = serport_ldisc_receive, .hangup = serport_ldisc_hangup, .write_wakeup = serport_ldisc_write_wakeup }; /* * The functions for insering/removing us as a module. */ static int __init serport_init(void) { int retval; retval = tty_register_ldisc(&serport_ldisc); if (retval) printk(KERN_ERR "serport.c: Error registering line discipline.\n"); return retval; } static void __exit serport_exit(void) { tty_unregister_ldisc(&serport_ldisc); } module_init(serport_init); module_exit(serport_exit); |
| 1 1 12 12 1 1 1 754 333 423 30 726 8 8 756 756 755 755 421 421 12 12 12 12 12 12 29 30 34 33 272 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 | // SPDX-License-Identifier: GPL-2.0 /* * Tag allocation using scalable bitmaps. Uses active queue tracking to support * fairer distribution of tags between multiple submitters when a shared tag map * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/kmemleak.h> #include <linux/delay.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* * Recalculate wakeup batch when tag is shared by hctx. */ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, unsigned int users) { if (!users) return; sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, users); sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, users); } /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail * to get tag when first time, the other shared-tag users could reserve * budget for it. */ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; unsigned long flags; struct blk_mq_tags *tags = hctx->tags; /* * calling test_bit() prior to test_and_set_bit() is intentional, * it avoids dirtying the cacheline if the queue is already active. */ if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irqsave(&tags->lock, flags); users = tags->active_queues + 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irqrestore(&tags->lock, flags); } /* * Wakeup all potentially sleeping on tags */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { sbitmap_queue_wake_all(&tags->bitmap_tags); if (include_reserve) sbitmap_queue_wake_all(&tags->breserved_tags); } /* * If a previously busy queue goes inactive, potential waiters could now * be allowed to queue. Wake them up and check. */ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; } spin_lock_irq(&tags->lock); users = tags->active_queues - 1; WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); spin_unlock_irq(&tags->lock); blk_mq_tag_wakeup_all(tags, false); } static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; if (data->shallow_depth) return sbitmap_queue_get_shallow(bt, data->shallow_depth); else return __sbitmap_queue_get(bt); } unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, unsigned int *offset) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt = &tags->bitmap_tags; unsigned long ret; if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED || data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) return 0; ret = __sbitmap_queue_get_batch(bt, nr_tags, offset); *offset += tags->nr_reserved_tags; return ret; } unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt; struct sbq_wait_state *ws; DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; int tag; if (data->flags & BLK_MQ_REQ_RESERVED) { if (unlikely(!tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } bt = &tags->breserved_tags; tag_offset = 0; } else { bt = &tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; } tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) goto found_tag; if (data->flags & BLK_MQ_REQ_NOWAIT) return BLK_MQ_NO_TAG; ws = bt_wait_ptr(bt, data->hctx); do { struct sbitmap_queue *bt_prev; /* * We're out of tags on this hardware queue, kick any * pending IO submits before going to sleep waiting for * some to complete. */ blk_mq_run_hw_queue(data->hctx, false); /* * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); tag = __blk_mq_get_tag(data, bt); if (tag != BLK_MQ_NO_TAG) break; bt_prev = bt; io_schedule(); sbitmap_finish_wait(bt, ws, &wait); data->ctx = blk_mq_get_ctx(data->q); data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; else bt = &tags->bitmap_tags; /* * If destination hw queue is changed, fake wake up on * previous queue for compensating the wake up miss, so * other allocations on previous queue won't be starved. */ if (bt != bt_prev) sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); sbitmap_finish_wait(bt, ws, &wait); found_tag: /* * Give up this allocation if the hctx is inactive. The caller will * retry on an active hctx. */ if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { blk_mq_put_tag(tags, data->ctx, tag + tag_offset); return BLK_MQ_NO_TAG; } return tag + tag_offset; } void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { if (!blk_mq_tag_is_reserved(tags, tag)) { const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) { sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags, tag_array, nr_tags); } struct bt_iter_data { struct blk_mq_hw_ctx *hctx; struct request_queue *q; busy_tag_iter_fn *fn; void *data; bool reserved; }; static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, unsigned int bitnr) { struct request *rq; rq = tags->rqs[bitnr]; if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; return rq; } static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; struct request_queue *q = iter_data->q; struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_tags *tags; struct request *rq; bool ret = true; if (blk_mq_is_shared_tags(set->flags)) tags = set->shared_tags; else tags = hctx->tags; if (!iter_data->reserved) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) ret = iter_data->fn(rq, iter_data->data); blk_mq_put_rq_ref(rq); return ret; } /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. * @q: Request queue @hctx is associated with (@hctx->queue). * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. * @fn will be called as follows: @fn(rq, @data) where rq is a * pointer to a request. Return %true to continue iterating tags; * %false to stop. * @data: Will be passed as second argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } struct bt_tags_iter_data { struct blk_mq_tags *tags; busy_tag_iter_fn *fn; void *data; unsigned int flags; }; #define BT_TAG_ITER_RESERVED (1 << 0) #define BT_TAG_ITER_STARTED (1 << 1) #define BT_TAG_ITER_STATIC_RQS (1 << 2) static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_tags_iter_data *iter_data = data; struct blk_mq_tags *tags = iter_data->tags; struct request *rq; bool ret = true; bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); if (!(iter_data->flags & BT_TAG_ITER_RESERVED)) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ if (iter_static_rqs) rq = tags->static_rqs[bitnr]; else rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; if (!(iter_data->flags & BT_TAG_ITER_STARTED) || blk_mq_request_started(rq)) ret = iter_data->fn(rq, iter_data->data); if (!iter_static_rqs) blk_mq_put_rq_ref(rq); return ret; } /** * bt_tags_for_each - iterate over the requests in a tag map * @tags: Tag map to iterate over. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @data) where rq * is a pointer to a request. Return %true to continue iterating * tags; %false to stop. * @data: Will be passed as second argument to @fn. * @flags: BT_TAG_ITER_* */ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, busy_tag_iter_fn *fn, void *data, unsigned int flags) { struct bt_tags_iter_data iter_data = { .tags = tags, .fn = fn, .data = data, .flags = flags, }; if (tags->rqs) sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv, unsigned int flags) { WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); if (tags->nr_reserved_tags) bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); } /** * blk_mq_all_tag_iter - iterate over all requests in a tag map * @tags: Tag map to iterate over. * @fn: Pointer to the function that will be called for each * request. @fn will be called as follows: @fn(rq, @priv) where rq * is a pointer to a request. Return %true to continue iterating * tags; %false to stop. * @priv: Will be passed as second argument to @fn. * * Caller has to pass the tag map from which requests are allocated. */ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS); } /** * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set * @tagset: Tag set to iterate over. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @priv) where * rq is a pointer to a request. Return true to continue iterating * tags, false to stop. * @priv: Will be passed as second argument to @fn. * * We grab one request reference before calling @fn and release it after * @fn returns. */ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { unsigned int flags = tagset->flags; int i, nr_tags, srcu_idx; srcu_idx = srcu_read_lock(&tagset->tags_srcu); nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); } srcu_read_unlock(&tagset->tags_srcu, srcu_idx); } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data) { unsigned *count = data; if (blk_mq_request_completed(rq)) (*count)++; return true; } /** * blk_mq_tagset_wait_completed_request - Wait until all scheduled request * completions have finished. * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown */ void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) { while (true) { unsigned count = 0; blk_mq_tagset_busy_iter(tagset, blk_mq_tagset_count_completed_rqs, &count); if (!count) break; msleep(5); } } EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); /** * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag * @q: Request queue to examine. * @fn: Pointer to the function that will be called for each request * on @q. @fn will be called as follows: @fn(rq, @priv) where rq * is a pointer to a request and hctx points to the hardware queue * associated with the request. * @priv: Will be passed as second argument to @fn. * * Note: if @q->tag_set is shared with other request queues then @fn will be * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { int srcu_idx; /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid * racing with it. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return; srcu_idx = srcu_read_lock(&q->tag_set->tags_srcu); if (blk_mq_is_shared_tags(q->tag_set->flags)) { struct blk_mq_tags *tags = q->tag_set->shared_tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) bt_for_each(NULL, q, bresv, fn, priv, true); bt_for_each(NULL, q, btags, fn, priv, false); } else { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { struct blk_mq_tags *tags = hctx->tags; struct sbitmap_queue *bresv = &tags->breserved_tags; struct sbitmap_queue *btags = &tags->bitmap_tags; /* * If no software queues are currently mapped to this * hardware queue, there's nothing to check */ if (!blk_mq_hw_queue_mapped(hctx)) continue; if (tags->nr_reserved_tags) bt_for_each(hctx, q, bresv, fn, priv, true); bt_for_each(hctx, q, btags, fn, priv, false); } } srcu_read_unlock(&q->tag_set->tags_srcu, srcu_idx); blk_queue_exit(q); } static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, bool round_robin, int node) { return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, node); } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, unsigned int flags, int node) { unsigned int depth = total_tags - reserved_tags; bool round_robin = flags & BLK_MQ_F_TAG_RR; struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { pr_err("blk-mq: tag depth too large\n"); return NULL; } tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); if (!tags) return NULL; tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); INIT_LIST_HEAD(&tags->page_list); if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) goto out_free_tags; if (bt_alloc(&tags->breserved_tags, reserved_tags, round_robin, node)) goto out_free_bitmap_tags; return tags; out_free_bitmap_tags: sbitmap_queue_free(&tags->bitmap_tags); out_free_tags: kfree(tags); return NULL; } static void blk_mq_free_tags_callback(struct rcu_head *head) { struct blk_mq_tags *tags = container_of(head, struct blk_mq_tags, rcu_head); struct page *page; while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); /* * Remove kmemleak object previously allocated in * blk_mq_alloc_rqs(). */ kmemleak_free(page_address(page)); __free_pages(page, page->private); } kfree(tags); } void blk_mq_free_tags(struct blk_mq_tag_set *set, struct blk_mq_tags *tags) { sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags); /* if tags pages is not allocated yet, free tags directly */ if (list_empty(&tags->page_list)) { kfree(tags); return; } call_srcu(&set->tags_srcu, &tags->rcu_head, blk_mq_free_tags_callback); } void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) { struct blk_mq_tags *tags = set->shared_tags; sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags); } void blk_mq_tag_update_sched_shared_tags(struct request_queue *q, unsigned int nr) { sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags, nr - q->tag_set->reserved_tags); } /** * blk_mq_unique_tag() - return a tag that is unique queue-wide * @rq: request for which to compute a unique tag * * The tag field in struct request is unique per hardware queue but not over * all hardware queues. Hence this function that returns a tag with the * hardware context index in the upper bits and the per hardware queue tag in * the lower bits. * * Note: When called for a request that is queued on a non-multiqueue request * queue, the hardware context index is set to zero. */ u32 blk_mq_unique_tag(struct request *rq) { return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag); |
| 18 20 11 13 11 20 101 2624 2611 48 13 11 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/etherdevice.h> #include <linux/if_macvlan.h> #include <linux/if_tap.h> #include <linux/if_vlan.h> #include <linux/interrupt.h> #include <linux/nsproxy.h> #include <linux/compat.h> #include <linux/if_tun.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/cache.h> #include <linux/sched/signal.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/wait.h> #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> #include <linux/uio.h> #include <net/net_namespace.h> #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/virtio_net.h> #include <linux/skb_array.h> struct macvtap_dev { struct macvlan_dev vlan; struct tap_dev tap; }; /* * Variables for dealing with macvtaps device numbers. */ static dev_t macvtap_major; static const void *macvtap_net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d->parent); return dev_net(dev); } static struct class macvtap_class = { .name = "macvtap", .ns_type = &net_ns_type_operations, .namespace = macvtap_net_namespace, }; static struct cdev macvtap_cdev; #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ NETIF_F_TSO6) static void macvtap_count_tx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; this_cpu_inc(vlan->pcpu_stats->tx_dropped); } static void macvtap_count_rx_dropped(struct tap_dev *tap) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; macvlan_count_rx(vlan, 0, 0, 0); } static void macvtap_update_features(struct tap_dev *tap, netdev_features_t features) { struct macvtap_dev *vlantap = container_of(tap, struct macvtap_dev, tap); struct macvlan_dev *vlan = &vlantap->vlan; vlan->set_features = features; netdev_update_features(vlan->dev); } static int macvtap_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct macvtap_dev *vlantap = netdev_priv(dev); int err; INIT_LIST_HEAD(&vlantap->tap.queue_list); /* Since macvlan supports all offloads by default, make * tap support all offloads also. */ vlantap->tap.tap_features = TUN_OFFLOADS; /* Register callbacks for rx/tx drops accounting and updating * net_device features */ vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped; vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped; vlantap->tap.update_features = macvtap_update_features; err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap); if (err) return err; /* Don't put anything that may fail after macvlan_common_newlink * because we can't undo what it does. */ err = macvlan_common_newlink(dev, params, extack); if (err) { netdev_rx_handler_unregister(dev); return err; } vlantap->tap.dev = vlantap->vlan.dev; return 0; } static void macvtap_dellink(struct net_device *dev, struct list_head *head) { struct macvtap_dev *vlantap = netdev_priv(dev); netdev_rx_handler_unregister(dev); tap_del_queues(&vlantap->tap); macvlan_dellink(dev, head); } static void macvtap_setup(struct net_device *dev) { macvlan_common_setup(dev); dev->tx_queue_len = TUN_READQ_SIZE; } static struct net *macvtap_link_net(const struct net_device *dev) { return dev_net(macvlan_dev_real_dev(dev)); } static struct rtnl_link_ops macvtap_link_ops __read_mostly = { .kind = "macvtap", .setup = macvtap_setup, .newlink = macvtap_newlink, .dellink = macvtap_dellink, .get_link_net = macvtap_link_net, .priv_size = sizeof(struct macvtap_dev), }; static int macvtap_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct macvtap_dev *vlantap; struct device *classdev; dev_t devt; int err; char tap_name[IFNAMSIZ]; if (dev->rtnl_link_ops != &macvtap_link_ops) return NOTIFY_DONE; snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex); vlantap = netdev_priv(dev); switch (event) { case NETDEV_REGISTER: /* Create the device node here after the network device has * been registered but before register_netdevice has * finished running. */ err = tap_get_minor(macvtap_major, &vlantap->tap); if (err) return notifier_from_errno(err); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); classdev = device_create(&macvtap_class, &dev->dev, devt, dev, "%s", tap_name); if (IS_ERR(classdev)) { tap_free_minor(macvtap_major, &vlantap->tap); return notifier_from_errno(PTR_ERR(classdev)); } err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj, tap_name); if (err) return notifier_from_errno(err); break; case NETDEV_UNREGISTER: /* vlan->minor == 0 if NETDEV_REGISTER above failed */ if (vlantap->tap.minor == 0) break; sysfs_remove_link(&dev->dev.kobj, tap_name); devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor); device_destroy(&macvtap_class, devt); tap_free_minor(macvtap_major, &vlantap->tap); break; case NETDEV_CHANGE_TX_QUEUE_LEN: if (tap_queue_resize(&vlantap->tap)) return NOTIFY_BAD; break; } return NOTIFY_DONE; } static struct notifier_block macvtap_notifier_block __read_mostly = { .notifier_call = macvtap_device_event, }; static int __init macvtap_init(void) { int err; err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap", THIS_MODULE); if (err) goto out1; err = class_register(&macvtap_class); if (err) goto out2; err = register_netdevice_notifier(&macvtap_notifier_block); if (err) goto out3; err = macvlan_link_register(&macvtap_link_ops); if (err) goto out4; return 0; out4: unregister_netdevice_notifier(&macvtap_notifier_block); out3: class_unregister(&macvtap_class); out2: tap_destroy_cdev(macvtap_major, &macvtap_cdev); out1: return err; } module_init(macvtap_init); static void __exit macvtap_exit(void) { rtnl_link_unregister(&macvtap_link_ops); unregister_netdevice_notifier(&macvtap_notifier_block); class_unregister(&macvtap_class); tap_destroy_cdev(macvtap_major, &macvtap_cdev); } module_exit(macvtap_exit); MODULE_ALIAS_RTNL_LINK("macvtap"); MODULE_DESCRIPTION("MAC-VLAN based tap driver"); MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>"); MODULE_LICENSE("GPL"); |
| 33 24 24 24 24 24 24 24 24 36 33 33 11 33 33 33 33 33 33 33 33 32 33 33 33 33 1 32 33 33 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 | /* SPDX-License-Identifier: GPL-2.0 * * page_pool.c * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ #include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/device.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/page_pool/memory_provider.h> #include <net/xdp.h> #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> #include <linux/mm.h> /* for put_page() */ #include <linux/poison.h> #include <linux/ethtool.h> #include <linux/netdevice.h> #include <trace/events/page_pool.h> #include "dev.h" #include "mp_dmabuf_devmem.h" #include "netmem_priv.h" #include "page_pool_priv.h" DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) #define BIAS_MAX (LONG_MAX >> 1) #ifdef CONFIG_PAGE_POOL_STATS static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats); /* alloc_stat_inc is intended to be used in softirq context */ #define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++) /* recycle_stat_inc is safe to use when preemption is possible. */ #define recycle_stat_inc(pool, __stat) \ do { \ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ this_cpu_inc(s->__stat); \ } while (0) #define recycle_stat_add(pool, __stat, val) \ do { \ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \ this_cpu_add(s->__stat, val); \ } while (0) static const char pp_stats[][ETH_GSTRING_LEN] = { "rx_pp_alloc_fast", "rx_pp_alloc_slow", "rx_pp_alloc_slow_ho", "rx_pp_alloc_empty", "rx_pp_alloc_refill", "rx_pp_alloc_waive", "rx_pp_recycle_cached", "rx_pp_recycle_cache_full", "rx_pp_recycle_ring", "rx_pp_recycle_ring_full", "rx_pp_recycle_released_ref", }; /** * page_pool_get_stats() - fetch page pool stats * @pool: pool from which page was allocated * @stats: struct page_pool_stats to fill in * * Retrieve statistics about the page_pool. This API is only available * if the kernel has been configured with ``CONFIG_PAGE_POOL_STATS=y``. * A pointer to a caller allocated struct page_pool_stats structure * is passed to this API which is filled in. The caller can then report * those stats to the user (perhaps via ethtool, debugfs, etc.). */ bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats) { int cpu = 0; if (!stats) return false; /* The caller is responsible to initialize stats. */ stats->alloc_stats.fast += pool->alloc_stats.fast; stats->alloc_stats.slow += pool->alloc_stats.slow; stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order; stats->alloc_stats.empty += pool->alloc_stats.empty; stats->alloc_stats.refill += pool->alloc_stats.refill; stats->alloc_stats.waive += pool->alloc_stats.waive; for_each_possible_cpu(cpu) { const struct page_pool_recycle_stats *pcpu = per_cpu_ptr(pool->recycle_stats, cpu); stats->recycle_stats.cached += pcpu->cached; stats->recycle_stats.cache_full += pcpu->cache_full; stats->recycle_stats.ring += pcpu->ring; stats->recycle_stats.ring_full += pcpu->ring_full; stats->recycle_stats.released_refcnt += pcpu->released_refcnt; } return true; } EXPORT_SYMBOL(page_pool_get_stats); u8 *page_pool_ethtool_stats_get_strings(u8 *data) { int i; for (i = 0; i < ARRAY_SIZE(pp_stats); i++) { memcpy(data, pp_stats[i], ETH_GSTRING_LEN); data += ETH_GSTRING_LEN; } return data; } EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings); int page_pool_ethtool_stats_get_count(void) { return ARRAY_SIZE(pp_stats); } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; *data++ = pool_stats->alloc_stats.slow_high_order; *data++ = pool_stats->alloc_stats.empty; *data++ = pool_stats->alloc_stats.refill; *data++ = pool_stats->alloc_stats.waive; *data++ = pool_stats->recycle_stats.cached; *data++ = pool_stats->recycle_stats.cache_full; *data++ = pool_stats->recycle_stats.ring; *data++ = pool_stats->recycle_stats.ring_full; *data++ = pool_stats->recycle_stats.released_refcnt; return data; } EXPORT_SYMBOL(page_pool_ethtool_stats_get); #else #define alloc_stat_inc(...) do { } while (0) #define recycle_stat_inc(...) do { } while (0) #define recycle_stat_add(...) do { } while (0) #endif static bool page_pool_producer_lock(struct page_pool *pool) __acquires(&pool->ring.producer_lock) { bool in_softirq = in_softirq(); if (in_softirq) spin_lock(&pool->ring.producer_lock); else spin_lock_bh(&pool->ring.producer_lock); return in_softirq; } static void page_pool_producer_unlock(struct page_pool *pool, bool in_softirq) __releases(&pool->ring.producer_lock) { if (in_softirq) spin_unlock(&pool->ring.producer_lock); else spin_unlock_bh(&pool->ring.producer_lock); } static void page_pool_struct_check(void) { CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, PAGE_POOL_FRAG_GROUP_ALIGN); } static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params, int cpuid) { unsigned int ring_qsize = 1024; /* Default */ struct netdev_rx_queue *rxq; int err; page_pool_struct_check(); memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); pool->cpuid = cpuid; pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ if (pool->slow.flags & ~PP_FLAG_ALL) return -EINVAL; if (pool->p.pool_size) ring_qsize = min(pool->p.pool_size, 16384); /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. */ if (pool->slow.flags & PP_FLAG_DMA_MAP) { if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; pool->dma_map = true; } if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped */ if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) return -EINVAL; if (!pool->p.max_len) return -EINVAL; pool->dma_sync = true; /* pool->p.offset has to be set according to the address * offset used by the DMA engine to start copying rx data */ } pool->has_init_callback = !!pool->slow.init_callback; #ifdef CONFIG_PAGE_POOL_STATS if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); if (!pool->recycle_stats) return -ENOMEM; } else { /* For system page pool instance we use a singular stats object * instead of allocating a separate percpu variable for each * (also percpu) page pool instance. */ pool->recycle_stats = &pp_system_recycle_stats; pool->system = true; } #endif if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif return -ENOMEM; } atomic_set(&pool->pages_state_release_cnt, 0); /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { netdev_assert_locked(pool->slow.netdev); rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; pool->mp_ops = rxq->mp_params.mp_ops; } if (pool->mp_ops) { if (!pool->dma_map || !pool->dma_sync) { err = -EOPNOTSUPP; goto free_ptr_ring; } if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { err = -EFAULT; goto free_ptr_ring; } err = pool->mp_ops->init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); goto free_ptr_ring; } static_branch_inc(&page_pool_mem_providers); } return 0; free_ptr_ring: ptr_ring_cleanup(&pool->ring, NULL); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif return err; } static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); #endif } /** * page_pool_create_percpu() - create a page pool for a given cpu. * @params: parameters, see struct page_pool_params * @cpuid: cpu identifier */ struct page_pool * page_pool_create_percpu(const struct page_pool_params *params, int cpuid) { struct page_pool *pool; int err; pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid); if (!pool) return ERR_PTR(-ENOMEM); err = page_pool_init(pool, params, cpuid); if (err < 0) goto err_free; err = page_pool_list(pool); if (err) goto err_uninit; return pool; err_uninit: page_pool_uninit(pool); err_free: pr_warn("%s() gave up with errno %d\n", __func__, err); kfree(pool); return ERR_PTR(err); } EXPORT_SYMBOL(page_pool_create_percpu); /** * page_pool_create() - create a page pool * @params: parameters, see struct page_pool_params */ struct page_pool *page_pool_create(const struct page_pool_params *params) { return page_pool_create_percpu(params, -1); } EXPORT_SYMBOL(page_pool_create); static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem); static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; netmem_ref netmem; int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ if (__ptr_ring_empty(r)) { alloc_stat_inc(pool, empty); return 0; } /* Softirq guarantee CPU and thus NUMA node is stable. This, * assumes CPU refilling driver RX-ring will also run RX-NAPI. */ #ifdef CONFIG_NUMA pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid; #else /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */ pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */ #endif /* Refill alloc array, but only if NUMA match */ do { netmem = (__force netmem_ref)__ptr_ring_consume(r); if (unlikely(!netmem)) break; if (likely(netmem_is_pref_nid(netmem, pref_nid))) { pool->alloc.cache[pool->alloc.count++] = netmem; } else { /* NUMA mismatch; * (1) release 1 page to page-allocator and * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ page_pool_return_netmem(pool, netmem); alloc_stat_inc(pool, waive); netmem = 0; break; } } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) { netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, refill); } return netmem; } /* fast path */ static netmem_ref __page_pool_get_cached(struct page_pool *pool) { netmem_ref netmem; /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ if (likely(pool->alloc.count)) { /* Fast-path */ netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, fast); } else { netmem = page_pool_refill_alloc_cache(pool); } return netmem; } static void __page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { #if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); dma_sync_size = min(dma_sync_size, pool->p.max_len); __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, dma_sync_size, pool->p.dma_dir); #endif } static __always_inline void page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { rcu_read_lock(); /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ if (pool->dma_sync) __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); rcu_read_unlock(); } } static int page_pool_register_dma_index(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { int err = 0; u32 id; if (unlikely(!PP_DMA_INDEX_BITS)) goto out; if (in_softirq()) err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), PP_DMA_INDEX_LIMIT, gfp); else err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), PP_DMA_INDEX_LIMIT, gfp); if (err) { WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); goto out; } netmem_set_dma_index(netmem, id); out: return err; } static int page_pool_release_dma_index(struct page_pool *pool, netmem_ref netmem) { struct page *old, *page = netmem_to_page(netmem); unsigned long id; if (unlikely(!PP_DMA_INDEX_BITS)) return 0; id = netmem_get_dma_index(netmem); if (!id) return -1; if (in_softirq()) old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); else old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); if (old != page) return -1; netmem_set_dma_index(netmem, 0); return 0; } static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; int err; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) * This mapping is kept for lifetime of page, until leaving pool. */ dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, (PAGE_SIZE << pool->p.order), pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); if (dma_mapping_error(pool->p.dev, dma)) return false; if (page_pool_set_dma_addr_netmem(netmem, dma)) { WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; } err = page_pool_register_dma_index(pool, netmem, gfp); if (err) goto unset_failed; page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; unset_failed: page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); return false; } static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { struct page *page; gfp |= __GFP_COMP; page = alloc_pages_node(pool->p.nid, gfp, pool->p.order); if (unlikely(!page)) return NULL; if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } alloc_stat_inc(pool, slow_high_order); page_pool_set_pp_info(pool, page_to_netmem(page)); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, page_to_netmem(page), pool->pages_state_hold_cnt); return page; } /* slow path */ static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool, gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; unsigned int pp_order = pool->p.order; bool dma_map = pool->dma_map; netmem_ref netmem; int i, nr_pages; /* Unconditionally set NOWARN if allocating from NAPI. * Drivers forget to set it, and OOM reports on packet Rx are useless. */ if ((gfp & GFP_ATOMIC) == GFP_ATOMIC) gfp |= __GFP_NOWARN; /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); /* Unnecessary as alloc cache is empty, but guarantees zero count */ if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) return 0; /* Pages have been filled into alloc.cache array, but count is zero and * page element have not been (possibly) DMA mapped. */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } page_pool_set_pp_info(pool, netmem); pool->alloc.cache[pool->alloc.count++] = netmem; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Return last page */ if (likely(pool->alloc.count > 0)) { netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, slow); } else { netmem = 0; } /* When page just alloc'ed is should/must have refcnt 1. */ return netmem; } /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { netmem_ref netmem; /* Fast-path: Get a page from cache */ netmem = __page_pool_get_cached(pool); if (netmem) return netmem; /* Slow-path: cache empty, do real allocation */ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) netmem = pool->mp_ops->alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_netmems_slow(pool, gfp); return netmem; } EXPORT_SYMBOL(page_pool_alloc_netmems); ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) { return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); /* Calculate distance between two u32 values, valid if distance is below 2^(31) * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution */ #define _distance(a, b) (s32)((a) - (b)) s32 page_pool_inflight(const struct page_pool *pool, bool strict) { u32 release_cnt = atomic_read(&pool->pages_state_release_cnt); u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt); s32 inflight; inflight = _distance(hold_cnt, release_cnt); if (strict) { trace_page_pool_release(pool, inflight, hold_cnt, release_cnt); WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight); } else { inflight = max(0, inflight); } return inflight; } void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) { netmem_set_pp(netmem, pool); netmem_or_pp_magic(netmem, PP_SIGNATURE); /* Ensuring all pages have been split into one fragment initially: * page_pool_set_pp_info() is only called once for every page when it * is allocated from the page allocator and page_pool_fragment_page() * is dirtying the same cache line as the page->pp_magic above, so * the overhead is negligible. */ page_pool_fragment_netmem(netmem, 1); if (pool->has_init_callback) pool->slow.init_callback(netmem, pool->slow.init_arg); } void page_pool_clear_pp_info(netmem_ref netmem) { netmem_clear_pp_magic(netmem); netmem_set_pp(netmem, NULL); } static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, netmem_ref netmem) { dma_addr_t dma; if (!pool->dma_map) /* Always account for inflight pages, even if we didn't * map them */ return; if (page_pool_release_dma_index(pool, netmem)) return; dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need * to disconnect a page (from a page_pool), to allow it to be used as * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem) { int count; bool put; put = true; if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) put = pool->mp_ops->release_netmem(pool, netmem); else __page_pool_release_netmem_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, netmem, count); if (put) { page_pool_clear_pp_info(netmem); put_page(netmem_to_page(netmem)); } /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a * __page_cache_release() call). */ } static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { bool in_softirq, ret; /* BH protection not needed if current is softirq */ in_softirq = page_pool_producer_lock(pool); ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); if (ret) recycle_stat_inc(pool, ring); page_pool_producer_unlock(pool, in_softirq); return ret; } /* Only allow direct recycling in special circumstances, into the * alloc side cache. E.g. during RX-NAPI processing for XDP_DROP use-case. * * Caller must provide appropriate safe context. */ static bool page_pool_recycle_in_cache(netmem_ref netmem, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { recycle_stat_inc(pool, cache_full); return false; } /* Caller MUST have verified/know (page_ref_count(page) == 1) */ pool->alloc.cache[pool->alloc.count++] = netmem; recycle_stat_inc(pool, cached); return true; } static bool __page_pool_page_can_be_recycled(netmem_ref netmem) { return netmem_is_net_iov(netmem) || (page_ref_count(netmem_to_page(netmem)) == 1 && !page_is_pfmemalloc(netmem_to_page(netmem))); } /* If the page refcnt == 1, this will try to recycle the page. * If pool->dma_sync is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ static __always_inline netmem_ref __page_pool_put_page(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { lockdep_assert_no_hardirq(); /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. * * refcnt == 1 means page_pool owns page, and can recycle it. * * page is NOT reusable when allocated when system is under * some pressure. (page_is_pfmemalloc) */ if (likely(__page_pool_page_can_be_recycled(netmem))) { /* Read barrier done in page_ref_count / READ_ONCE */ page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) return 0; /* Page found as candidate for recycling */ return netmem; } /* Fallback/non-XDP mode: API user have elevated refcnt. * * Many drivers split up the page into fragments, and some * want to keep doing this to save memory and do refcnt based * recycling. Support this use case too, to ease drivers * switching between XDP/non-XDP. * * In-case page_pool maintains the DMA mapping, API user must * call page_pool_put_page once. In this elevated refcnt * case, the DMA is unmapped/released, as driver is likely * doing refcnt based recycle tricks, meaning another process * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); page_pool_return_netmem(pool, netmem); return 0; } static bool page_pool_napi_local(const struct page_pool *pool) { const struct napi_struct *napi; u32 cpuid; /* On PREEMPT_RT the softirq can be preempted by the consumer */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) return false; if (unlikely(!in_softirq())) return false; /* Allow direct recycle if we have reasons to believe that we are * in the same context as the consumer would run, so there's * no possible race. * __page_pool_put_page() makes sure we're not in hardirq context * and interrupts are enabled prior to accessing the cache. */ cpuid = smp_processor_id(); if (READ_ONCE(pool->cpuid) == cpuid) return true; napi = READ_ONCE(pool->p.napi); return napi && READ_ONCE(napi->list_owner) == cpuid; } void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { if (!allow_direct) allow_direct = page_pool_napi_local(pool); netmem = __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct); if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); page_pool_return_netmem(pool, netmem); } } EXPORT_SYMBOL(page_pool_put_unrefed_netmem); void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, allow_direct); } EXPORT_SYMBOL(page_pool_put_unrefed_page); static void page_pool_recycle_ring_bulk(struct page_pool *pool, netmem_ref *bulk, u32 bulk_len) { bool in_softirq; u32 i; /* Bulk produce into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); for (i = 0; i < bulk_len; i++) { if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; } } page_pool_producer_unlock(pool, in_softirq); recycle_stat_add(pool, ring, i); /* Hopefully all pages were returned into ptr_ring */ if (likely(i == bulk_len)) return; /* * ptr_ring cache is full, free remaining pages outside producer lock * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) page_pool_return_netmem(pool, bulk[i]); } /** * page_pool_put_netmem_bulk() - release references on multiple netmems * @data: array holding netmem references * @count: number of entries in @data * * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() * will release leftover netmems to the memory provider. * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx * completion loop for the XDP_REDIRECT use case. * * Please note the caller must not use data area after running * page_pool_put_netmem_bulk(), as this function overwrites it. */ void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) { u32 bulk_len = 0; for (u32 i = 0; i < count; i++) { netmem_ref netmem = netmem_compound_head(data[i]); if (page_pool_unref_and_test(netmem)) data[bulk_len++] = netmem; } count = bulk_len; while (count) { netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; struct page_pool *pool = NULL; bool allow_direct; u32 foreign = 0; bulk_len = 0; for (u32 i = 0; i < count; i++) { struct page_pool *netmem_pp; netmem_ref netmem = data[i]; netmem_pp = netmem_get_pp(netmem); if (unlikely(!pool)) { pool = netmem_pp; allow_direct = page_pool_napi_local(pool); } else if (netmem_pp != pool) { /* * If the netmem belongs to a different * page_pool, save it for another round. */ data[foreign++] = netmem; continue; } netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); /* Approved for bulk recycling in ptr_ring cache */ if (netmem) bulk[bulk_len++] = netmem; } if (bulk_len) page_pool_recycle_ring_bulk(pool, bulk, bulk_len); count = foreign; } } EXPORT_SYMBOL(page_pool_put_netmem_bulk); static netmem_ref page_pool_drain_frag(struct page_pool *pool, netmem_ref netmem) { long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ if (likely(page_pool_unref_netmem(netmem, drain_count))) return 0; if (__page_pool_page_can_be_recycled(netmem)) { page_pool_dma_sync_for_device(pool, netmem, -1); return netmem; } page_pool_return_netmem(pool, netmem); return 0; } static void page_pool_free_frag(struct page_pool *pool) { long drain_count = BIAS_MAX - pool->frag_users; netmem_ref netmem = pool->frag_page; pool->frag_page = 0; if (!netmem || page_pool_unref_netmem(netmem, drain_count)) return; page_pool_return_netmem(pool, netmem); } netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; netmem_ref netmem = pool->frag_page; if (WARN_ON(size > max_size)) return 0; size = ALIGN(size, dma_get_cache_alignment()); *offset = pool->frag_offset; if (netmem && *offset + size > max_size) { netmem = page_pool_drain_frag(pool, netmem); if (netmem) { recycle_stat_inc(pool, cached); alloc_stat_inc(pool, fast); goto frag_reset; } } if (!netmem) { netmem = page_pool_alloc_netmems(pool, gfp); if (unlikely(!netmem)) { pool->frag_page = 0; return 0; } pool->frag_page = netmem; frag_reset: pool->frag_users = 1; *offset = 0; pool->frag_offset = size; page_pool_fragment_netmem(netmem, BIAS_MAX); return netmem; } pool->frag_users++; pool->frag_offset = *offset + size; return netmem; } EXPORT_SYMBOL(page_pool_alloc_frag_netmem); struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size, gfp_t gfp) { return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, gfp)); } EXPORT_SYMBOL(page_pool_alloc_frag); static void page_pool_empty_ring(struct page_pool *pool) { netmem_ref netmem; /* Empty recycle ring */ while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ if (!(netmem_ref_count(netmem) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", __func__, netmem_ref_count(netmem)); page_pool_return_netmem(pool, netmem); } } static void __page_pool_destroy(struct page_pool *pool) { if (pool->disconnect) pool->disconnect(pool); page_pool_unlist(pool); page_pool_uninit(pool); if (pool->mp_ops) { pool->mp_ops->destroy(pool); static_branch_dec(&page_pool_mem_providers); } kfree(pool); } static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { netmem_ref netmem; if (pool->destroy_cnt) return; /* Empty alloc cache, assume caller made sure this is * no-longer in use, and page_pool_alloc_pages() cannot be * call concurrently. */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; page_pool_return_netmem(pool, netmem); } } static void page_pool_scrub(struct page_pool *pool) { unsigned long id; void *ptr; page_pool_empty_alloc_cache_once(pool); if (!pool->destroy_cnt++ && pool->dma_map) { if (pool->dma_sync) { /* Disable page_pool_dma_sync_for_device() */ pool->dma_sync = false; /* Make sure all concurrent returns that may see the old * value of dma_sync (and thus perform a sync) have * finished before doing the unmapping below. Skip the * wait if the device doesn't actually need syncing, or * if there are no outstanding mapped pages. */ if (dma_dev_need_sync(pool->p.dev) && !xa_empty(&pool->dma_mapped)) synchronize_net(); } xa_for_each(&pool->dma_mapped, id, ptr) __page_pool_release_netmem_dma(pool, page_to_netmem((struct page *)ptr)); } /* No more consumers should exist, but producers could still * be in-flight. */ page_pool_empty_ring(pool); } static int page_pool_release(struct page_pool *pool) { bool in_softirq; int inflight; page_pool_scrub(pool); inflight = page_pool_inflight(pool, true); /* Acquire producer lock to make sure producers have exited. */ in_softirq = page_pool_producer_lock(pool); page_pool_producer_unlock(pool, in_softirq); if (!inflight) __page_pool_destroy(pool); return inflight; } static void page_pool_release_retry(struct work_struct *wq) { struct delayed_work *dwq = to_delayed_work(wq); struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw); void *netdev; int inflight; inflight = page_pool_release(pool); /* In rare cases, a driver bug may cause inflight to go negative. * Don't reschedule release if inflight is 0 or negative. * - If 0, the page_pool has been destroyed * - if negative, we will never recover * in both cases no reschedule is necessary. */ if (inflight <= 0) return; /* Periodic warning for page pools the user can't see */ netdev = READ_ONCE(pool->slow.netdev); if (time_after_eq(jiffies, pool->defer_warn) && (!netdev || netdev == NET_PTR_POISON)) { int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ; pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n", __func__, pool->user.id, inflight, sec); pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; } /* Still not ready to be disconnected, retry later */ schedule_delayed_work(&pool->release_dw, DEFER_TIME); } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; pool->xdp_mem_id = mem->id; } /** * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI * @pool: page pool to modify * @napi: NAPI instance to associate the page pool with * * Associate a page pool with a NAPI instance for lockless page recycling. * This is useful when a new page pool has to be added to a NAPI instance * without disabling that NAPI instance, to mark the point at which control * path "hands over" the page pool to the NAPI instance. In most cases driver * can simply set the @napi field in struct page_pool_params, and does not * have to call this helper. * * The function is idempotent, but does not implement any refcounting. * Single page_pool_disable_direct_recycling() will disable recycling, * no matter how many times enable was called. */ void page_pool_enable_direct_recycling(struct page_pool *pool, struct napi_struct *napi) { if (READ_ONCE(pool->p.napi) == napi) return; WARN_ON(!napi || pool->p.napi); mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, napi); mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_enable_direct_recycling); void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. * Paired with READ_ONCE() in page_pool_napi_local(). */ WRITE_ONCE(pool->cpuid, -1); if (!pool->p.napi) return; napi_assert_will_not_race(pool->p.napi); mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_disable_direct_recycling); void page_pool_destroy(struct page_pool *pool) { if (!pool) return; if (!page_pool_put(pool)) return; page_pool_disable_direct_recycling(pool); page_pool_free_frag(pool); if (!page_pool_release(pool)) return; page_pool_detached(pool); pool->defer_start = jiffies; pool->defer_warn = jiffies + DEFER_WARN_INTERVAL; INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry); schedule_delayed_work(&pool->release_dw, DEFER_TIME); } EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { netmem_ref netmem; trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { netmem = pool->alloc.cache[--pool->alloc.count]; page_pool_return_netmem(pool, netmem); } } EXPORT_SYMBOL(page_pool_update_nid); bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) { return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); } /* Associate a niov with a page pool. Should follow with a matching * net_mp_niov_clear_page_pool() */ void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) { netmem_ref netmem = net_iov_to_netmem(niov); page_pool_set_pp_info(pool, netmem); pool->pages_state_hold_cnt++; trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Disassociate a niov from a page pool. Should only be used in the * ->release_netmem() path. */ void net_mp_niov_clear_page_pool(struct net_iov *niov) { netmem_ref netmem = net_iov_to_netmem(niov); page_pool_clear_pp_info(netmem); } |
| 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Mirics MSi001 silicon tuner driver * * Copyright (C) 2013 Antti Palosaari <crope@iki.fi> * Copyright (C) 2014 Antti Palosaari <crope@iki.fi> */ #include <linux/module.h> #include <linux/gcd.h> #include <media/v4l2-device.h> #include <media/v4l2-ctrls.h> static const struct v4l2_frequency_band bands[] = { { .type = V4L2_TUNER_RF, .index = 0, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 49000000, .rangehigh = 263000000, }, { .type = V4L2_TUNER_RF, .index = 1, .capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS, .rangelow = 390000000, .rangehigh = 960000000, }, }; struct msi001_dev { struct spi_device *spi; struct v4l2_subdev sd; /* Controls */ struct v4l2_ctrl_handler hdl; struct v4l2_ctrl *bandwidth_auto; struct v4l2_ctrl *bandwidth; struct v4l2_ctrl *lna_gain; struct v4l2_ctrl *mixer_gain; struct v4l2_ctrl *if_gain; unsigned int f_tuner; }; static inline struct msi001_dev *sd_to_msi001_dev(struct v4l2_subdev *sd) { return container_of(sd, struct msi001_dev, sd); } static int msi001_wreg(struct msi001_dev *dev, u32 data) { /* Register format: 4 bits addr + 20 bits value */ return spi_write(dev->spi, &data, 3); }; static int msi001_set_gain(struct msi001_dev *dev, int lna_gain, int mixer_gain, int if_gain) { struct spi_device *spi = dev->spi; int ret; u32 reg; dev_dbg(&spi->dev, "lna=%d mixer=%d if=%d\n", lna_gain, mixer_gain, if_gain); reg = 1 << 0; reg |= (59 - if_gain) << 4; reg |= 0 << 10; reg |= (1 - mixer_gain) << 12; reg |= (1 - lna_gain) << 13; reg |= 4 << 14; reg |= 0 << 17; ret = msi001_wreg(dev, reg); if (ret) goto err; return 0; err: dev_dbg(&spi->dev, "failed %d\n", ret); return ret; }; static int msi001_set_tuner(struct msi001_dev *dev) { struct spi_device *spi = dev->spi; int ret, i; unsigned int uitmp, div_n, k, k_thresh, k_frac, div_lo, f_if1; u32 reg; u64 f_vco; u8 mode, filter_mode; static const struct { u32 rf; u8 mode; u8 div_lo; } band_lut[] = { { 50000000, 0xe1, 16}, /* AM_MODE2, antenna 2 */ {108000000, 0x42, 32}, /* VHF_MODE */ {330000000, 0x44, 16}, /* B3_MODE */ {960000000, 0x48, 4}, /* B45_MODE */ { ~0U, 0x50, 2}, /* BL_MODE */ }; static const struct { u32 freq; u8 filter_mode; } if_freq_lut[] = { { 0, 0x03}, /* Zero IF */ { 450000, 0x02}, /* 450 kHz IF */ {1620000, 0x01}, /* 1.62 MHz IF */ {2048000, 0x00}, /* 2.048 MHz IF */ }; static const struct { u32 freq; u8 val; } bandwidth_lut[] = { { 200000, 0x00}, /* 200 kHz */ { 300000, 0x01}, /* 300 kHz */ { 600000, 0x02}, /* 600 kHz */ {1536000, 0x03}, /* 1.536 MHz */ {5000000, 0x04}, /* 5 MHz */ {6000000, 0x05}, /* 6 MHz */ {7000000, 0x06}, /* 7 MHz */ {8000000, 0x07}, /* 8 MHz */ }; unsigned int f_rf = dev->f_tuner; /* * bandwidth (Hz) * 200000, 300000, 600000, 1536000, 5000000, 6000000, 7000000, 8000000 */ unsigned int bandwidth; /* * intermediate frequency (Hz) * 0, 450000, 1620000, 2048000 */ unsigned int f_if = 0; #define F_REF 24000000 #define DIV_PRE_N 4 #define F_VCO_STEP div_lo dev_dbg(&spi->dev, "f_rf=%d f_if=%d\n", f_rf, f_if); for (i = 0; i < ARRAY_SIZE(band_lut); i++) { if (f_rf <= band_lut[i].rf) { mode = band_lut[i].mode; div_lo = band_lut[i].div_lo; break; } } if (i == ARRAY_SIZE(band_lut)) { ret = -EINVAL; goto err; } /* AM_MODE is upconverted */ if ((mode >> 0) & 0x1) f_if1 = 5 * F_REF; else f_if1 = 0; for (i = 0; i < ARRAY_SIZE(if_freq_lut); i++) { if (f_if == if_freq_lut[i].freq) { filter_mode = if_freq_lut[i].filter_mode; break; } } if (i == ARRAY_SIZE(if_freq_lut)) { ret = -EINVAL; goto err; } /* filters */ bandwidth = dev->bandwidth->val; bandwidth = clamp(bandwidth, 200000U, 8000000U); for (i = 0; i < ARRAY_SIZE(bandwidth_lut); i++) { if (bandwidth <= bandwidth_lut[i].freq) { bandwidth = bandwidth_lut[i].val; break; } } if (i == ARRAY_SIZE(bandwidth_lut)) { ret = -EINVAL; goto err; } dev->bandwidth->val = bandwidth_lut[i].freq; dev_dbg(&spi->dev, "bandwidth selected=%d\n", bandwidth_lut[i].freq); /* * Fractional-N synthesizer * * +---------------------------------------+ * v | * Fref +----+ +-------+ +----+ +------+ +---+ * ------> | PD | --> | VCO | ------> | /4 | --> | /N.F | <-- | K | * +----+ +-------+ +----+ +------+ +---+ * | * | * v * +-------+ Fout * | /Rout | ------> * +-------+ */ /* Calculate PLL integer and fractional control word. */ f_vco = (u64) (f_rf + f_if + f_if1) * div_lo; div_n = div_u64_rem(f_vco, DIV_PRE_N * F_REF, &k); k_thresh = (DIV_PRE_N * F_REF) / F_VCO_STEP; k_frac = div_u64((u64) k * k_thresh, (DIV_PRE_N * F_REF)); /* Find out greatest common divisor and divide to smaller. */ uitmp = gcd(k_thresh, k_frac); k_thresh /= uitmp; k_frac /= uitmp; /* Force divide to reg max. Resolution will be reduced. */ uitmp = DIV_ROUND_UP(k_thresh, 4095); k_thresh = DIV_ROUND_CLOSEST(k_thresh, uitmp); k_frac = DIV_ROUND_CLOSEST(k_frac, uitmp); /* Calculate real RF set. */ uitmp = (unsigned int) F_REF * DIV_PRE_N * div_n; uitmp += (unsigned int) F_REF * DIV_PRE_N * k_frac / k_thresh; uitmp /= div_lo; dev_dbg(&spi->dev, "f_rf=%u:%u f_vco=%llu div_n=%u k_thresh=%u k_frac=%u div_lo=%u\n", f_rf, uitmp, f_vco, div_n, k_thresh, k_frac, div_lo); ret = msi001_wreg(dev, 0x00000e); if (ret) goto err; ret = msi001_wreg(dev, 0x000003); if (ret) goto err; reg = 0 << 0; reg |= mode << 4; reg |= filter_mode << 12; reg |= bandwidth << 14; reg |= 0x02 << 17; reg |= 0x00 << 20; ret = msi001_wreg(dev, reg); if (ret) goto err; reg = 5 << 0; reg |= k_thresh << 4; reg |= 1 << 19; reg |= 1 << 21; ret = msi001_wreg(dev, reg); if (ret) goto err; reg = 2 << 0; reg |= k_frac << 4; reg |= div_n << 16; ret = msi001_wreg(dev, reg); if (ret) goto err; ret = msi001_set_gain(dev, dev->lna_gain->cur.val, dev->mixer_gain->cur.val, dev->if_gain->cur.val); if (ret) goto err; reg = 6 << 0; reg |= 63 << 4; reg |= 4095 << 10; ret = msi001_wreg(dev, reg); if (ret) goto err; return 0; err: dev_dbg(&spi->dev, "failed %d\n", ret); return ret; } static int msi001_standby(struct v4l2_subdev *sd) { struct msi001_dev *dev = sd_to_msi001_dev(sd); return msi001_wreg(dev, 0x000000); } static int msi001_g_tuner(struct v4l2_subdev *sd, struct v4l2_tuner *v) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; dev_dbg(&spi->dev, "index=%d\n", v->index); strscpy(v->name, "Mirics MSi001", sizeof(v->name)); v->type = V4L2_TUNER_RF; v->capability = V4L2_TUNER_CAP_1HZ | V4L2_TUNER_CAP_FREQ_BANDS; v->rangelow = 49000000; v->rangehigh = 960000000; return 0; } static int msi001_s_tuner(struct v4l2_subdev *sd, const struct v4l2_tuner *v) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; dev_dbg(&spi->dev, "index=%d\n", v->index); return 0; } static int msi001_g_frequency(struct v4l2_subdev *sd, struct v4l2_frequency *f) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; dev_dbg(&spi->dev, "tuner=%d\n", f->tuner); f->frequency = dev->f_tuner; return 0; } static int msi001_s_frequency(struct v4l2_subdev *sd, const struct v4l2_frequency *f) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; unsigned int band; dev_dbg(&spi->dev, "tuner=%d type=%d frequency=%u\n", f->tuner, f->type, f->frequency); if (f->frequency < ((bands[0].rangehigh + bands[1].rangelow) / 2)) band = 0; else band = 1; dev->f_tuner = clamp_t(unsigned int, f->frequency, bands[band].rangelow, bands[band].rangehigh); return msi001_set_tuner(dev); } static int msi001_enum_freq_bands(struct v4l2_subdev *sd, struct v4l2_frequency_band *band) { struct msi001_dev *dev = sd_to_msi001_dev(sd); struct spi_device *spi = dev->spi; dev_dbg(&spi->dev, "tuner=%d type=%d index=%d\n", band->tuner, band->type, band->index); if (band->index >= ARRAY_SIZE(bands)) return -EINVAL; band->capability = bands[band->index].capability; band->rangelow = bands[band->index].rangelow; band->rangehigh = bands[band->index].rangehigh; return 0; } static const struct v4l2_subdev_tuner_ops msi001_tuner_ops = { .standby = msi001_standby, .g_tuner = msi001_g_tuner, .s_tuner = msi001_s_tuner, .g_frequency = msi001_g_frequency, .s_frequency = msi001_s_frequency, .enum_freq_bands = msi001_enum_freq_bands, }; static const struct v4l2_subdev_ops msi001_ops = { .tuner = &msi001_tuner_ops, }; static int msi001_s_ctrl(struct v4l2_ctrl *ctrl) { struct msi001_dev *dev = container_of(ctrl->handler, struct msi001_dev, hdl); struct spi_device *spi = dev->spi; int ret; dev_dbg(&spi->dev, "id=%d name=%s val=%d min=%lld max=%lld step=%lld\n", ctrl->id, ctrl->name, ctrl->val, ctrl->minimum, ctrl->maximum, ctrl->step); switch (ctrl->id) { case V4L2_CID_RF_TUNER_BANDWIDTH_AUTO: case V4L2_CID_RF_TUNER_BANDWIDTH: ret = msi001_set_tuner(dev); break; case V4L2_CID_RF_TUNER_LNA_GAIN: ret = msi001_set_gain(dev, dev->lna_gain->val, dev->mixer_gain->cur.val, dev->if_gain->cur.val); break; case V4L2_CID_RF_TUNER_MIXER_GAIN: ret = msi001_set_gain(dev, dev->lna_gain->cur.val, dev->mixer_gain->val, dev->if_gain->cur.val); break; case V4L2_CID_RF_TUNER_IF_GAIN: ret = msi001_set_gain(dev, dev->lna_gain->cur.val, dev->mixer_gain->cur.val, dev->if_gain->val); break; default: dev_dbg(&spi->dev, "unknown control %d\n", ctrl->id); ret = -EINVAL; } return ret; } static const struct v4l2_ctrl_ops msi001_ctrl_ops = { .s_ctrl = msi001_s_ctrl, }; static int msi001_probe(struct spi_device *spi) { struct msi001_dev *dev; int ret; dev_dbg(&spi->dev, "\n"); dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { ret = -ENOMEM; goto err; } dev->spi = spi; dev->f_tuner = bands[0].rangelow; v4l2_spi_subdev_init(&dev->sd, spi, &msi001_ops); /* Register controls */ v4l2_ctrl_handler_init(&dev->hdl, 5); dev->bandwidth_auto = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_BANDWIDTH_AUTO, 0, 1, 1, 1); dev->bandwidth = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_BANDWIDTH, 200000, 8000000, 1, 200000); if (dev->hdl.error) { ret = dev->hdl.error; dev_err(&spi->dev, "Could not initialize controls\n"); /* control init failed, free handler */ goto err_ctrl_handler_free; } v4l2_ctrl_auto_cluster(2, &dev->bandwidth_auto, 0, false); dev->lna_gain = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_LNA_GAIN, 0, 1, 1, 1); dev->mixer_gain = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_MIXER_GAIN, 0, 1, 1, 1); dev->if_gain = v4l2_ctrl_new_std(&dev->hdl, &msi001_ctrl_ops, V4L2_CID_RF_TUNER_IF_GAIN, 0, 59, 1, 0); if (dev->hdl.error) { ret = dev->hdl.error; dev_err(&spi->dev, "Could not initialize controls\n"); /* control init failed, free handler */ goto err_ctrl_handler_free; } dev->sd.ctrl_handler = &dev->hdl; return 0; err_ctrl_handler_free: v4l2_ctrl_handler_free(&dev->hdl); kfree(dev); err: return ret; } static void msi001_remove(struct spi_device *spi) { struct v4l2_subdev *sd = spi_get_drvdata(spi); struct msi001_dev *dev = sd_to_msi001_dev(sd); dev_dbg(&spi->dev, "\n"); /* * Registered by v4l2_spi_new_subdev() from master driver, but we must * unregister it from here. Weird. */ v4l2_device_unregister_subdev(&dev->sd); v4l2_ctrl_handler_free(&dev->hdl); kfree(dev); } static const struct spi_device_id msi001_id_table[] = { {"msi001", 0}, {} }; MODULE_DEVICE_TABLE(spi, msi001_id_table); static struct spi_driver msi001_driver = { .driver = { .name = "msi001", .suppress_bind_attrs = true, }, .probe = msi001_probe, .remove = msi001_remove, .id_table = msi001_id_table, }; module_spi_driver(msi001_driver); MODULE_AUTHOR("Antti Palosaari <crope@iki.fi>"); MODULE_DESCRIPTION("Mirics MSi001"); MODULE_LICENSE("GPL"); |
| 2 3 1 2 1 2 2 2 2 1 1 1 3 3 7 5 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/firmware.h> #include <linux/sfp.h> #include <net/devlink.h> #include <net/netdev_lock.h> #include "netlink.h" #include "common.h" #include "bitset.h" #include "module_fw.h" struct module_req_info { struct ethnl_req_info base; }; struct module_reply_data { struct ethnl_reply_data base; struct ethtool_module_power_mode_params power; }; #define MODULE_REPDATA(__reply_base) \ container_of(__reply_base, struct module_reply_data, base) /* MODULE_GET */ const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1] = { [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int module_get_power_mode(struct net_device *dev, struct module_reply_data *data, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->get_module_power_mode) return 0; if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Module firmware flashing is in progress"); return -EBUSY; } return ops->get_module_power_mode(dev, &data->power, extack); } static int module_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct module_reply_data *data = MODULE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = module_get_power_mode(dev, data, info->extack); if (ret < 0) goto out_complete; out_complete: ethnl_ops_complete(dev); return ret; } static int module_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { struct module_reply_data *data = MODULE_REPDATA(reply_base); int len = 0; if (data->power.policy) len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE_POLICY */ if (data->power.mode) len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE */ return len; } static int module_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct module_reply_data *data = MODULE_REPDATA(reply_base); if (data->power.policy && nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE_POLICY, data->power.policy)) return -EMSGSIZE; if (data->power.mode && nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE, data->power.mode)) return -EMSGSIZE; return 0; } /* MODULE_SET */ const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = { [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_POWER_MODE_POLICY] = NLA_POLICY_RANGE(NLA_U8, ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH, ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO), }; static int ethnl_set_module_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; struct nlattr **tb = info->attrs; if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) return 0; if (req_info->dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(info->extack, "Module firmware flashing is in progress"); return -EBUSY; } if (!ops->get_module_power_mode || !ops->set_module_power_mode) { NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], "Setting power mode policy is not supported by this device"); return -EOPNOTSUPP; } return 1; } static int ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info) { struct ethtool_module_power_mode_params power = {}; struct ethtool_module_power_mode_params power_new; const struct ethtool_ops *ops; struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; int ret; ops = dev->ethtool_ops; power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); ret = ops->get_module_power_mode(dev, &power, info->extack); if (ret < 0) return ret; if (power_new.policy == power.policy) return 0; ret = ops->set_module_power_mode(dev, &power_new, info->extack); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_module_request_ops = { .request_cmd = ETHTOOL_MSG_MODULE_GET, .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, .hdr_attr = ETHTOOL_A_MODULE_HEADER, .req_info_size = sizeof(struct module_req_info), .reply_data_size = sizeof(struct module_reply_data), .prepare_data = module_prepare_data, .reply_size = module_reply_size, .fill_reply = module_fill_reply, .set_validate = ethnl_set_module_validate, .set = ethnl_set_module, .set_ntf_cmd = ETHTOOL_MSG_MODULE_NTF, }; /* MODULE_FW_FLASH_ACT */ const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1] = { [ETHTOOL_A_MODULE_FW_FLASH_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME] = { .type = NLA_NUL_STRING }, [ETHTOOL_A_MODULE_FW_FLASH_PASSWORD] = { .type = NLA_U32 }, }; static LIST_HEAD(module_fw_flash_work_list); static DEFINE_SPINLOCK(module_fw_flash_work_list_lock); static int module_flash_fw_work_list_add(struct ethtool_module_fw_flash *module_fw, struct genl_info *info) { struct ethtool_module_fw_flash *work; /* First, check if already registered. */ spin_lock(&module_fw_flash_work_list_lock); list_for_each_entry(work, &module_fw_flash_work_list, list) { if (work->fw_update.ntf_params.portid == info->snd_portid && work->fw_update.dev == module_fw->fw_update.dev) { spin_unlock(&module_fw_flash_work_list_lock); return -EALREADY; } } list_add_tail(&module_fw->list, &module_fw_flash_work_list); spin_unlock(&module_fw_flash_work_list_lock); return 0; } static void module_flash_fw_work_list_del(struct list_head *list) { spin_lock(&module_fw_flash_work_list_lock); list_del(list); spin_unlock(&module_fw_flash_work_list_lock); } static void module_flash_fw_work(struct work_struct *work) { struct ethtool_module_fw_flash *module_fw; module_fw = container_of(work, struct ethtool_module_fw_flash, work); ethtool_cmis_fw_update(&module_fw->fw_update); module_flash_fw_work_list_del(&module_fw->list); module_fw->fw_update.dev->ethtool->module_fw_flash_in_progress = false; netdev_put(module_fw->fw_update.dev, &module_fw->dev_tracker); release_firmware(module_fw->fw_update.fw); kfree(module_fw); } #define MODULE_EEPROM_PHYS_ID_PAGE 0 #define MODULE_EEPROM_PHYS_ID_I2C_ADDR 0x50 static int module_flash_fw_work_init(struct ethtool_module_fw_flash *module_fw, struct net_device *dev, struct netlink_ext_ack *extack) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_module_eeprom page_data = {}; u8 phys_id; int err; /* Fetch the SFF-8024 Identifier Value. For all supported standards, it * is located at I2C address 0x50, byte 0. See section 4.1 in SFF-8024, * revision 4.9. */ page_data.page = MODULE_EEPROM_PHYS_ID_PAGE; page_data.offset = SFP_PHYS_ID; page_data.length = sizeof(phys_id); page_data.i2c_address = MODULE_EEPROM_PHYS_ID_I2C_ADDR; page_data.data = &phys_id; err = ops->get_module_eeprom_by_page(dev, &page_data, extack); if (err < 0) return err; switch (phys_id) { case SFF8024_ID_QSFP_DD: case SFF8024_ID_OSFP: case SFF8024_ID_DSFP: case SFF8024_ID_QSFP_PLUS_CMIS: case SFF8024_ID_SFP_DD_CMIS: case SFF8024_ID_SFP_PLUS_CMIS: INIT_WORK(&module_fw->work, module_flash_fw_work); break; default: NL_SET_ERR_MSG(extack, "Module type does not support firmware flashing"); return -EOPNOTSUPP; } return 0; } void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv) { struct ethtool_module_fw_flash *work; spin_lock(&module_fw_flash_work_list_lock); list_for_each_entry(work, &module_fw_flash_work_list, list) { if (work->fw_update.dev == sk_priv->dev && work->fw_update.ntf_params.portid == sk_priv->portid) { work->fw_update.ntf_params.closed_sock = true; break; } } spin_unlock(&module_fw_flash_work_list_lock); } static int module_flash_fw_schedule(struct net_device *dev, const char *file_name, struct ethtool_module_fw_flash_params *params, struct sk_buff *skb, struct genl_info *info) { struct ethtool_cmis_fw_update_params *fw_update; struct ethtool_module_fw_flash *module_fw; int err; module_fw = kzalloc(sizeof(*module_fw), GFP_KERNEL); if (!module_fw) return -ENOMEM; fw_update = &module_fw->fw_update; fw_update->params = *params; err = request_firmware_direct(&fw_update->fw, file_name, &dev->dev); if (err) { NL_SET_ERR_MSG(info->extack, "Failed to request module firmware image"); goto err_free; } err = module_flash_fw_work_init(module_fw, dev, info->extack); if (err < 0) goto err_release_firmware; dev->ethtool->module_fw_flash_in_progress = true; netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL); fw_update->dev = dev; fw_update->ntf_params.portid = info->snd_portid; fw_update->ntf_params.seq = info->snd_seq; fw_update->ntf_params.closed_sock = false; err = ethnl_sock_priv_set(skb, dev, fw_update->ntf_params.portid, ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH); if (err < 0) goto err_release_firmware; err = module_flash_fw_work_list_add(module_fw, info); if (err < 0) goto err_release_firmware; schedule_work(&module_fw->work); return 0; err_release_firmware: release_firmware(fw_update->fw); err_free: kfree(module_fw); return err; } static int module_flash_fw(struct net_device *dev, struct nlattr **tb, struct sk_buff *skb, struct genl_info *info) { struct ethtool_module_fw_flash_params params = {}; const char *file_name; struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME)) return -EINVAL; file_name = nla_data(tb[ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME]); attr = tb[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD]; if (attr) { params.password = cpu_to_be32(nla_get_u32(attr)); params.password_valid = true; } return module_flash_fw_schedule(dev, file_name, ¶ms, skb, info); } static int ethnl_module_fw_flash_validate(struct net_device *dev, struct netlink_ext_ack *extack) { struct devlink_port *devlink_port = dev->devlink_port; const struct ethtool_ops *ops = dev->ethtool_ops; if (!ops->set_module_eeprom_by_page || !ops->get_module_eeprom_by_page) { NL_SET_ERR_MSG(extack, "Flashing module firmware is not supported by this device"); return -EOPNOTSUPP; } if (!ops->reset) { NL_SET_ERR_MSG(extack, "Reset module is not supported by this device, so flashing is not permitted"); return -EOPNOTSUPP; } if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Module firmware flashing already in progress"); return -EBUSY; } if (dev->flags & IFF_UP) { NL_SET_ERR_MSG(extack, "Netdevice is up, so flashing is not permitted"); return -EBUSY; } if (devlink_port && devlink_port->attrs.split) { NL_SET_ERR_MSG(extack, "Can't perform firmware flashing on a split port"); return -EOPNOTSUPP; } return 0; } int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info) { struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; struct net_device *dev; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_MODULE_FW_FLASH_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; rtnl_lock(); netdev_lock_ops(dev); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_unlock; ret = ethnl_module_fw_flash_validate(dev, info->extack); if (ret < 0) goto out_unlock; ret = module_flash_fw(dev, tb, skb, info); ethnl_ops_complete(dev); out_unlock: netdev_unlock_ops(dev); rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; } /* MODULE_FW_FLASH_NTF */ static int ethnl_module_fw_flash_ntf_put_err(struct sk_buff *skb, char *err_msg, char *sub_err_msg) { int err_msg_len, sub_err_msg_len, total_len; struct nlattr *attr; if (!err_msg) return 0; err_msg_len = strlen(err_msg); total_len = err_msg_len + 2; /* For period and NUL. */ if (sub_err_msg) { sub_err_msg_len = strlen(sub_err_msg); total_len += sub_err_msg_len + 2; /* For ", ". */ } attr = nla_reserve(skb, ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG, total_len); if (!attr) return -ENOMEM; if (sub_err_msg) sprintf(nla_data(attr), "%s, %s.", err_msg, sub_err_msg); else sprintf(nla_data(attr), "%s.", err_msg); return 0; } static void ethnl_module_fw_flash_ntf(struct net_device *dev, enum ethtool_module_fw_flash_status status, struct ethnl_module_fw_flash_ntf_params *ntf_params, char *err_msg, char *sub_err_msg, u64 done, u64 total) { struct sk_buff *skb; void *hdr; int ret; if (ntf_params->closed_sock) return; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return; hdr = ethnl_unicast_put(skb, ntf_params->portid, ++ntf_params->seq, ETHTOOL_MSG_MODULE_FW_FLASH_NTF); if (!hdr) goto err_skb; ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_MODULE_FW_FLASH_HEADER); if (ret < 0) goto err_skb; if (nla_put_u32(skb, ETHTOOL_A_MODULE_FW_FLASH_STATUS, status)) goto err_skb; ret = ethnl_module_fw_flash_ntf_put_err(skb, err_msg, sub_err_msg); if (ret < 0) goto err_skb; if (nla_put_uint(skb, ETHTOOL_A_MODULE_FW_FLASH_DONE, done)) goto err_skb; if (nla_put_uint(skb, ETHTOOL_A_MODULE_FW_FLASH_TOTAL, total)) goto err_skb; genlmsg_end(skb, hdr); genlmsg_unicast(dev_net(dev), skb, ntf_params->portid); return; err_skb: nlmsg_free(skb); } void ethnl_module_fw_flash_ntf_err(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params, char *err_msg, char *sub_err_msg) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_ERROR, params, err_msg, sub_err_msg, 0, 0); } void ethnl_module_fw_flash_ntf_start(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_STARTED, params, NULL, NULL, 0, 0); } void ethnl_module_fw_flash_ntf_complete(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_COMPLETED, params, NULL, NULL, 0, 0); } void ethnl_module_fw_flash_ntf_in_progress(struct net_device *dev, struct ethnl_module_fw_flash_ntf_params *params, u64 done, u64 total) { ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_IN_PROGRESS, params, NULL, NULL, done, total); } |
| 14 14 17 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | /* * Copyright 2011 Red Hat, Inc. * Copyright © 2014 The Chromium OS Authors * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software") * to deal in the software without restriction, including without limitation * on the rights to use, copy, modify, merge, publish, distribute, sub * license, and/or sell copies of the Software, and to permit persons to whom * them Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTIBILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER * IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Adam Jackson <ajax@redhat.com> * Ben Widawsky <ben@bwidawsk.net> */ /* * This is vgem, a (non-hardware-backed) GEM service. This is used by Mesa's * software renderer and the X server for efficient buffer sharing. */ #include <linux/dma-buf.h> #include <linux/module.h> #include <linux/device/faux.h> #include <linux/shmem_fs.h> #include <linux/vmalloc.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_prime.h> #include "vgem_drv.h" #define DRIVER_NAME "vgem" #define DRIVER_DESC "Virtual GEM provider" #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0 static struct vgem_device { struct drm_device drm; struct faux_device *faux_dev; } *vgem_device; static int vgem_open(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile; int ret; vfile = kzalloc(sizeof(*vfile), GFP_KERNEL); if (!vfile) return -ENOMEM; file->driver_priv = vfile; ret = vgem_fence_open(vfile); if (ret) { kfree(vfile); return ret; } return 0; } static void vgem_postclose(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile = file->driver_priv; vgem_fence_close(vfile); kfree(vfile); } static struct drm_ioctl_desc vgem_ioctls[] = { DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_RENDER_ALLOW), }; DEFINE_DRM_GEM_FOPS(vgem_driver_fops); static struct drm_gem_object *vgem_gem_create_object(struct drm_device *dev, size_t size) { struct drm_gem_shmem_object *obj; obj = kzalloc(sizeof(*obj), GFP_KERNEL); if (!obj) return ERR_PTR(-ENOMEM); /* * vgem doesn't have any begin/end cpu access ioctls, therefore must use * coherent memory or dma-buf sharing just wont work. */ obj->map_wc = true; return &obj->base; } static const struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_RENDER, .open = vgem_open, .postclose = vgem_postclose, .ioctls = vgem_ioctls, .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops, DRM_GEM_SHMEM_DRIVER_OPS, .gem_create_object = vgem_gem_create_object, .name = DRIVER_NAME, .desc = DRIVER_DESC, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, }; static int __init vgem_init(void) { int ret; struct faux_device *fdev; fdev = faux_device_create("vgem", NULL, NULL); if (!fdev) return -ENODEV; if (!devres_open_group(&fdev->dev, NULL, GFP_KERNEL)) { ret = -ENOMEM; goto out_unregister; } dma_coerce_mask_and_coherent(&fdev->dev, DMA_BIT_MASK(64)); vgem_device = devm_drm_dev_alloc(&fdev->dev, &vgem_driver, struct vgem_device, drm); if (IS_ERR(vgem_device)) { ret = PTR_ERR(vgem_device); goto out_devres; } vgem_device->faux_dev = fdev; /* Final step: expose the device/driver to userspace */ ret = drm_dev_register(&vgem_device->drm, 0); if (ret) goto out_devres; return 0; out_devres: devres_release_group(&fdev->dev, NULL); out_unregister: faux_device_destroy(fdev); return ret; } static void __exit vgem_exit(void) { struct faux_device *fdev = vgem_device->faux_dev; drm_dev_unregister(&vgem_device->drm); devres_release_group(&fdev->dev, NULL); faux_device_destroy(fdev); } module_init(vgem_init); module_exit(vgem_exit); MODULE_AUTHOR("Red Hat, Inc."); MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL and additional rights"); |
| 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | // SPDX-License-Identifier: GPL-2.0-only /* * HID driver for Aureal Cy se W-01RN USB_V3.1 devices * * Copyright (c) 2010 Franco Catrin <fcatrin@gmail.com> * Copyright (c) 2010 Ben Cropley <bcropley@internode.on.net> * * Based on HID sunplus driver by * Copyright (c) 1999 Andreas Gal * Copyright (c) 2000-2005 Vojtech Pavlik <vojtech@suse.cz> * Copyright (c) 2005 Michael Haboustak <mike-@cinci.rr.com> for Concept2, Inc * Copyright (c) 2006-2007 Jiri Kosina * Copyright (c) 2008 Jiri Slaby */ #include <linux/device.h> #include <linux/hid.h> #include <linux/module.h> #include "hid-ids.h" static const __u8 *aureal_report_fixup(struct hid_device *hdev, __u8 *rdesc, unsigned int *rsize) { if (*rsize >= 54 && rdesc[52] == 0x25 && rdesc[53] == 0x01) { dev_info(&hdev->dev, "fixing Aureal Cy se W-01RN USB_V3.1 report descriptor.\n"); rdesc[53] = 0x65; } return rdesc; } static const struct hid_device_id aureal_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_AUREAL, USB_DEVICE_ID_AUREAL_W01RN) }, { } }; MODULE_DEVICE_TABLE(hid, aureal_devices); static struct hid_driver aureal_driver = { .name = "aureal", .id_table = aureal_devices, .report_fixup = aureal_report_fixup, }; module_hid_driver(aureal_driver); MODULE_DESCRIPTION("HID driver for Aureal Cy se W-01RN USB_V3.1 devices"); MODULE_LICENSE("GPL"); |
| 3 3 2 3 3 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright 2019 ARM Ltd. * * Generic implementation of update_vsyscall and update_vsyscall_tz. * * Based on the x86 specific implementation. */ #include <linux/hrtimer.h> #include <linux/timekeeper_internal.h> #include <vdso/datapage.h> #include <vdso/helpers.h> #include <vdso/vsyscall.h> #include "timekeeping_internal.h" static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base) { vc->cycle_last = base->cycle_last; #ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT vc->max_cycles = base->clock->max_cycles; #endif vc->mask = base->mask; vc->mult = base->mult; vc->shift = base->shift; } static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) { struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono); fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw); /* CLOCK_MONOTONIC */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* Copy MONOTONIC time for BOOTTIME */ sec = vdso_ts->sec; /* Add the boot offset */ sec += tk->monotonic_to_boot.tv_sec; nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); vdso_ts->sec++; } vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { struct vdso_time_data *vdata = vdso_k_time_data; struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; /* copy vsyscall data */ vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; vc[CS_HRES_COARSE].clock_mode = clock_mode; vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). */ WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) update_vdso_time_data(vdata, tk); __arch_update_vdso_clock(&vc[CS_HRES_COARSE]); __arch_update_vdso_clock(&vc[CS_RAW]); vdso_write_end(vdata); __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { struct vdso_time_data *vdata = vdso_k_time_data; vdata->tz_minuteswest = sys_tz.tz_minuteswest; vdata->tz_dsttime = sys_tz.tz_dsttime; __arch_sync_vdso_time_data(vdata); } #ifdef CONFIG_POSIX_AUX_CLOCKS void vdso_time_update_aux(struct timekeeper *tk) { struct vdso_time_data *vdata = vdso_k_time_data; struct vdso_timestamp *vdso_ts; struct vdso_clock *vc; s32 clock_mode; u64 nsec; vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST]; vdso_ts = &vc->basetime[VDSO_BASE_AUX]; clock_mode = tk->tkr_mono.clock->vdso_clock_mode; if (!tk->clock_valid) clock_mode = VDSO_CLOCKMODE_NONE; /* copy vsyscall data */ vdso_write_begin_clock(vc); vc->clock_mode = clock_mode; if (clock_mode != VDSO_CLOCKMODE_NONE) { fill_clock_configuration(vc, &tk->tkr_mono); vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec; nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; nsec += tk->monotonic_to_aux.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); nsec = nsec << tk->tkr_mono.shift; vdso_ts->nsec = nsec; } __arch_update_vdso_clock(vc); vdso_write_end_clock(vc); __arch_sync_vdso_time_data(vdata); } #endif /** * vdso_update_begin - Start of a VDSO update section * * Allows architecture code to safely update the architecture specific VDSO * data. Disables interrupts, acquires timekeeper lock to serialize against * concurrent updates from timekeeping and invalidates the VDSO data * sequence counter to prevent concurrent readers from accessing * inconsistent data. * * Returns: Saved interrupt flags which need to be handed in to * vdso_update_end(). */ unsigned long vdso_update_begin(void) { struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); return flags; } /** * vdso_update_end - End of a VDSO update section * @flags: Interrupt flags as returned from vdso_update_begin() * * Pairs with vdso_update_begin(). Marks vdso data consistent, invokes data * synchronization if the architecture requires it, drops timekeeper lock * and restores interrupt flags. */ void vdso_update_end(unsigned long flags) { struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); } |
| 1782 919 919 919 919 28 28 28 2 9 9 8 1 1386 1387 1384 1387 1388 1388 6 5 3 3 1 3 3 6 6 6 2 6 6 6 6 5 6 11 11 2 5 5 4 2 6 11 11 2 11 11 4 11 11 11 6 2 5 11 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 | /* * cgroup_freezer.c - control group freezer subsystem * * Copyright IBM Corporation, 2007 * * Author : Cedric Le Goater <clg@fr.ibm.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #include <linux/export.h> #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/freezer.h> #include <linux/seq_file.h> #include <linux/mutex.h> #include <linux/cpu.h> /* * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of * its ancestors has FREEZING_SELF set. */ enum freezer_state_flags { CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ /* mask for all FREEZING flags */ CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, }; struct freezer { struct cgroup_subsys_state css; unsigned int state; }; static DEFINE_MUTEX(freezer_mutex); static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) { return css ? container_of(css, struct freezer, css) : NULL; } static inline struct freezer *task_freezer(struct task_struct *task) { return css_freezer(task_css(task, freezer_cgrp_id)); } static struct freezer *parent_freezer(struct freezer *freezer) { return css_freezer(freezer->css.parent); } bool cgroup_freezing(struct task_struct *task) { bool ret; rcu_read_lock(); ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; } static const char *freezer_state_strs(unsigned int state) { if (state & CGROUP_FROZEN) return "FROZEN"; if (state & CGROUP_FREEZING) return "FREEZING"; return "THAWED"; }; static struct cgroup_subsys_state * freezer_css_alloc(struct cgroup_subsys_state *parent_css) { struct freezer *freezer; freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); if (!freezer) return ERR_PTR(-ENOMEM); return &freezer->css; } /** * freezer_css_online - commit creation of a freezer css * @css: css being created * * We're committing to creation of @css. Mark it online and inherit * parent's freezing state while holding cpus read lock and freezer_mutex. */ static int freezer_css_online(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); cpus_read_lock(); mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; static_branch_inc_cpuslocked(&freezer_active); } mutex_unlock(&freezer_mutex); cpus_read_unlock(); return 0; } /** * freezer_css_offline - initiate destruction of a freezer css * @css: css being destroyed * * @css is going away. Mark it dead and decrement freezer_active if * it was holding one. */ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); cpus_read_lock(); mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) static_branch_dec_cpuslocked(&freezer_active); freezer->state = 0; mutex_unlock(&freezer_mutex); cpus_read_unlock(); } static void freezer_css_free(struct cgroup_subsys_state *css) { kfree(css_freezer(css)); } /* * Tasks can be migrated into a different freezer anytime regardless of its * current state. freezer_attach() is responsible for making new tasks * conform to the current state. * * Freezer state changes and task migration are synchronized via * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ static void freezer_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); /* * Make the new tasks conform to the current state of @new_css. * For simplicity, when migrating any task to a FROZEN cgroup, we * revert it to FREEZING and let update_if_frozen() determine the * correct state later. * * Tasks in @tset are on @new_css but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ cgroup_taskset_for_each(task, new_css, tset) { struct freezer *freezer = css_freezer(new_css); if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; freezer = parent_freezer(freezer); } freeze_task(task); } } mutex_unlock(&freezer_mutex); } /** * freezer_fork - cgroup post fork callback * @task: a task which has just been forked * * @task has just been created and should conform to the current state of * the cgroup_freezer it belongs to. This function may race against * freezer_attach(). Losing to freezer_attach() means that we don't have * to do anything as freezer_attach() will put @task into the appropriate * state. */ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; /* * The root cgroup is non-freezable, so we can skip locking the * freezer. This is safe regardless of race with task migration. * If we didn't race or won, skipping is obviously the right thing * to do. If we lost and root is the new cgroup, noop is still the * right thing to do. */ if (task_css_is_root(task, freezer_cgrp_id)) return; mutex_lock(&freezer_mutex); rcu_read_lock(); freezer = task_freezer(task); if (freezer->state & CGROUP_FREEZING) freeze_task(task); rcu_read_unlock(); mutex_unlock(&freezer_mutex); } /** * update_if_frozen - update whether a cgroup finished freezing * @css: css of interest * * Once FREEZING is initiated, transition to FROZEN is lazily updated by * calling this function. If the current state is FREEZING but not FROZEN, * this function checks whether all tasks of this cgroup and the descendant * cgroups finished freezing and, if so, sets FROZEN. * * The caller is responsible for grabbing RCU read lock and calling * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being * migrated into or out of @css, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct cgroup_subsys_state *pos; struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZING) || (freezer->state & CGROUP_FROZEN)) return; /* are all (live) children frozen? */ rcu_read_lock(); css_for_each_child(pos, css) { struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && !(child->state & CGROUP_FROZEN)) { rcu_read_unlock(); return; } } rcu_read_unlock(); /* are all tasks frozen? */ css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { if (freezing(task) && !frozen(task)) goto out_iter_end; } freezer->state |= CGROUP_FROZEN; out_iter_end: css_task_iter_end(&it); } static int freezer_read(struct seq_file *m, void *v) { struct cgroup_subsys_state *css = seq_css(m), *pos; mutex_lock(&freezer_mutex); rcu_read_lock(); /* update states bottom-up */ css_for_each_descendant_post(pos, css) { if (!css_tryget_online(pos)) continue; rcu_read_unlock(); update_if_frozen(pos); rcu_read_lock(); css_put(pos); } rcu_read_unlock(); mutex_unlock(&freezer_mutex); seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); return 0; } static void freeze_cgroup(struct freezer *freezer) { struct css_task_iter it; struct task_struct *task; css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) freeze_task(task); css_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) { struct css_task_iter it; struct task_struct *task; css_task_iter_start(&freezer->css, 0, &it); while ((task = css_task_iter_next(&it))) __thaw_task(task); css_task_iter_end(&it); } /** * freezer_apply_state - apply state change to a single cgroup_freezer * @freezer: freezer to apply state change to * @freeze: whether to freeze or unfreeze * @state: CGROUP_FREEZING_* flag to set or clear * * Set or clear @state on @cgroup according to @freeze, and perform * freezing or thawing as necessary. */ static void freezer_apply_state(struct freezer *freezer, bool freeze, unsigned int state) { /* also synchronizes against task migration, see freezer_attach() */ lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZER_ONLINE)) return; if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) static_branch_inc_cpuslocked(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { bool was_freezing = freezer->state & CGROUP_FREEZING; freezer->state &= ~state; if (!(freezer->state & CGROUP_FREEZING)) { freezer->state &= ~CGROUP_FROZEN; if (was_freezing) static_branch_dec_cpuslocked(&freezer_active); unfreeze_cgroup(freezer); } } } /** * freezer_change_state - change the freezing state of a cgroup_freezer * @freezer: freezer of interest * @freeze: whether to freeze or thaw * * Freeze or thaw @freezer according to @freeze. The operations are * recursive - all descendants of @freezer will be affected. */ static void freezer_change_state(struct freezer *freezer, bool freeze) { struct cgroup_subsys_state *pos; cpus_read_lock(); /* * Update all its descendants in pre-order traversal. Each * descendant will try to inherit its parent's FREEZING state as * CGROUP_FREEZING_PARENT. */ mutex_lock(&freezer_mutex); rcu_read_lock(); css_for_each_descendant_pre(pos, &freezer->css) { struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); if (!css_tryget_online(pos)) continue; rcu_read_unlock(); if (pos_f == freezer) freezer_apply_state(pos_f, freeze, CGROUP_FREEZING_SELF); else freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, CGROUP_FREEZING_PARENT); rcu_read_lock(); css_put(pos); } rcu_read_unlock(); mutex_unlock(&freezer_mutex); cpus_read_unlock(); } static ssize_t freezer_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { bool freeze; buf = strstrip(buf); if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) { pr_info_once("Freezing with imperfect legacy cgroup freezer. " "See cgroup.freeze of cgroup v2\n"); freeze = true; } else return -EINVAL; freezer_change_state(css_freezer(of_css(of)), freeze); return nbytes; } static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_SELF); } static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, struct cftype *cft) { struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_PARENT); } static struct cftype files[] = { { .name = "state", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = freezer_read, .write = freezer_write, }, { .name = "self_freezing", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = freezer_self_freezing_read, }, { .name = "parent_freezing", .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = freezer_parent_freezing_read, }, { } /* terminate */ }; struct cgroup_subsys freezer_cgrp_subsys = { .css_alloc = freezer_css_alloc, .css_online = freezer_css_online, .css_offline = freezer_css_offline, .css_free = freezer_css_free, .attach = freezer_attach, .fork = freezer_fork, .legacy_cftypes = files, }; |
| 11 11 233 5 159 596 188 156 32 189 1 187 32 155 177 177 29 149 188 1 188 183 183 14 14 14 14 14 11 14 203 189 14 434 214 220 430 429 2 424 425 175 265 429 1 1 139 12 135 139 95 44 30 30 19 11 11 30 30 8 30 139 139 139 139 44 18 80 32 63 112 28 139 28 112 139 95 44 22 22 1 21 30 19 11 4584 4583 4583 4590 4585 11 11 11 11 11 11 22 22 22 25 25 2 7 16 8 1 7 7 3811 3822 3820 3818 3 3818 1645 3818 6050 3746 6049 3811 6053 6050 5961 103 6052 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 | // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/file.c - kernfs file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/uio.h> #include "kernfs-internal.h" struct kernfs_open_node { struct rcu_head rcu_head; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ unsigned int nr_mmapped; unsigned int nr_to_release; }; /* * kernfs_notify() may be called from any context and bounces notifications * through a work item. To minimize space overhead in kernfs_node, the * pending queue is implemented as a singly linked list of kernfs_nodes. * The list is terminated with the self pointer so that whether a * kernfs_node is on the list or not can be determined by testing the next * pointer for %NULL. */ #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) static DEFINE_SPINLOCK(kernfs_notify_lock); static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn) { int idx = hash_ptr(kn, NR_KERNFS_LOCK_BITS); return &kernfs_locks->open_file_mutex[idx]; } static inline struct mutex *kernfs_open_file_mutex_lock(struct kernfs_node *kn) { struct mutex *lock; lock = kernfs_open_file_mutex_ptr(kn); mutex_lock(lock); return lock; } /** * of_on - Get the kernfs_open_node of the specified kernfs_open_file * @of: target kernfs_open_file * * Return: the kernfs_open_node of the kernfs_open_file */ static struct kernfs_open_node *of_on(struct kernfs_open_file *of) { return rcu_dereference_protected(of->kn->attr.open, !list_empty(&of->list)); } /* Get active reference to kernfs node for an open file */ static struct kernfs_open_file *kernfs_get_active_of(struct kernfs_open_file *of) { /* Skip if file was already released */ if (unlikely(of->released)) return NULL; if (!kernfs_get_active(of->kn)) return NULL; return of; } static void kernfs_put_active_of(struct kernfs_open_file *of) { return kernfs_put_active(of->kn); } /** * kernfs_deref_open_node_locked - Get kernfs_open_node corresponding to @kn * * @kn: target kernfs_node. * * Fetch and return ->attr.open of @kn when caller holds the * kernfs_open_file_mutex_ptr(kn). * * Update of ->attr.open happens under kernfs_open_file_mutex_ptr(kn). So when * the caller guarantees that this mutex is being held, other updaters can't * change ->attr.open and this means that we can safely deref ->attr.open * outside RCU read-side critical section. * * The caller needs to make sure that kernfs_open_file_mutex is held. * * Return: @kn->attr.open when kernfs_open_file_mutex is held. */ static struct kernfs_open_node * kernfs_deref_open_node_locked(struct kernfs_node *kn) { return rcu_dereference_protected(kn->attr.open, lockdep_is_held(kernfs_open_file_mutex_ptr(kn))); } static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } /* * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } /* * As kernfs_seq_stop() is also called after kernfs_seq_start() or * kernfs_seq_next() failure, it needs to distinguish whether it's stopping * a seq_file iteration which is fully initialized with an active reference * or an aborted kernfs_seq_start() due to get_active failure. The * position pointer is the only context for each seq_file iteration and * thus the stop condition should be encoded in it. As the return value is * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable * choice to indicate get_active failure. * * Unfortunately, this is complicated due to the optional custom seq_file * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or * custom seq_file operations and thus can't decide whether put_active * should be performed or not only on ERR_PTR(-ENODEV). * * This is worked around by factoring out the custom seq_stop() and * put_active part into kernfs_seq_stop_active(), skipping it from * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures * that kernfs_seq_stop_active() is skipped only after get_active failure. */ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); kernfs_put_active_of(of); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active_of(of)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); if (ops->seq_start) { void *next = ops->seq_start(sf, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } return single_start(sf, ppos); } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { void *next = ops->seq_next(sf, v, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(), always * terminate after the initial read. */ ++*ppos; return NULL; } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; if (v != ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of_on(of)->event); return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { .start = kernfs_seq_start, .next = kernfs_seq_next, .stop = kernfs_seq_stop, .show = kernfs_seq_show, }; /* * As reading a bin file can have side-effects, the exact offset and bytes * specified in read(2) call should be passed to the read callback making * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ static ssize_t kernfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = min_t(size_t, iov_iter_count(iter), PAGE_SIZE); const struct kernfs_ops *ops; char *buf; buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active_of(of)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } of->event = atomic_read(&of_on(of)->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len < 0) goto out_free; if (copy_to_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static ssize_t kernfs_fop_read_iter(struct kiocb *iocb, struct iov_iter *iter) { if (kernfs_of(iocb->ki_filp)->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read_iter(iocb, iter); return kernfs_file_read_iter(iocb, iter); } /* * Copy data in from userland and pass it to the matching kernfs write * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, * modify only the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter) { struct kernfs_open_file *of = kernfs_of(iocb->ki_filp); ssize_t len = iov_iter_count(iter); const struct kernfs_ops *ops; char *buf; if (of->atomic_write_len) { if (len > of->atomic_write_len) return -E2BIG; } else { len = min_t(size_t, len, PAGE_SIZE); } buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_iter(buf, len, iter) != len) { len = -EFAULT; goto out_free; } buf[len] = '\0'; /* guarantee string termination */ /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, iocb->ki_pos); else len = -EINVAL; kernfs_put_active_of(of); mutex_unlock(&of->mutex); if (len > 0) iocb->ki_pos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; if (!kernfs_get_active_of(of)) return; if (of->vm_ops->open) of->vm_ops->open(vma); kernfs_put_active_of(of); } static vm_fault_t kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); kernfs_put_active_of(of); return ret; } static vm_fault_t kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); vm_fault_t ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active_of(of)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); kernfs_put_active_of(of); return ret; } static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return -EINVAL; if (!kernfs_get_active_of(of)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); kernfs_put_active_of(of); return ret; } static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; /* * mmap path and of->mutex are prone to triggering spurious lockdep * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_fop_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; if (!kernfs_get_active_of(of)) goto out_unlock; ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); if (rc) goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() * to satisfy versions of X which crash if the mmap fails: that * substitutes a new vm_file, and we don't then want bin_vm_ops. */ if (vma->vm_file != file) goto out_put; rc = -EINVAL; if (of->mmapped && of->vm_ops != vma->vm_ops) goto out_put; /* * It is not possible to successfully wrap close. * So error if someone is trying to use close. */ if (vma->vm_ops && vma->vm_ops->close) goto out_put; rc = 0; if (!of->mmapped) { of->mmapped = true; of_on(of)->nr_mmapped++; of->vm_ops = vma->vm_ops; } vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active_of(of); out_unlock: mutex_unlock(&of->mutex); return rc; } /** * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * * Locking: * Kernel thread context (may sleep). * * Return: * %0 on success, -errno on failure. */ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { /* not there, initialize a new one */ on = kzalloc(sizeof(*on), GFP_KERNEL); if (!on) { mutex_unlock(mutex); return -ENOMEM; } atomic_set(&on->event, 1); init_waitqueue_head(&on->poll); INIT_LIST_HEAD(&on->files); rcu_assign_pointer(kn->attr.open, on); } list_add_tail(&of->list, &on->files); if (kn->flags & KERNFS_HAS_RELEASE) on->nr_to_release++; mutex_unlock(mutex); return 0; } /** * kernfs_unlink_open_file - Unlink @of from @kn. * * @kn: target kernfs_node * @of: associated kernfs_open_file * @open_failed: ->open() failed, cancel ->release() * * Unlink @of from list of @kn's associated open files. If list of * associated open files becomes empty, disassociate and free * kernfs_open_node. * * LOCKING: * None. */ static void kernfs_unlink_open_file(struct kernfs_node *kn, struct kernfs_open_file *of, bool open_failed) { struct kernfs_open_node *on; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } if (of) { if (kn->flags & KERNFS_HAS_RELEASE) { WARN_ON_ONCE(of->released == open_failed); if (open_failed) on->nr_to_release--; } if (of->mmapped) on->nr_mmapped--; list_del(&of->list); } if (list_empty(&on->files)) { rcu_assign_pointer(kn->attr.open, NULL); kfree_rcu(on, rcu_head); } mutex_unlock(mutex); } static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; has_mmap = ops->mmap; /* see the flag definition for details */ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { if ((file->f_mode & FMODE_WRITE) && (!(inode->i_mode & S_IWUGO) || !has_write)) goto err_out; if ((file->f_mode & FMODE_READ) && (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); if (!of) goto err_out; /* * The following is done to give a different lockdep key to * @of->mutex for files which implement mmap. This is a rather * crude way to avoid false positive lockdep warning around * mm->mmap_lock - mmap nests @of->mutex under mm->mmap_lock and * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under * which mm->mmap_lock nests, while holding @of->mutex. As each * open file has a separate mutex, it's okay as long as those don't * happen on the same file. At this point, we can't easily give * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * * For similar reasons, writable and readonly files are given different * lockdep key, because the writable file /sys/power/resume may call vfs * lookup helpers for arbitrary paths and readonly files can be read by * overlayfs from vfs helpers when sysfs is a lower layer of overalyfs. * * All three cases look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); else if (file->f_mode & FMODE_WRITE) mutex_init(&of->mutex); else mutex_init(&of->mutex); of->kn = kn; of->file = file; /* * Write path needs to atomic_write_len outside active reference. * Cache it in open_file. See kernfs_fop_write_iter() for details. */ of->atomic_write_len = ops->atomic_write_len; error = -EINVAL; /* * ->seq_show is incompatible with ->prealloc, * as seq_read does its own allocation. * ->read must be used instead. */ if (ops->prealloc && ops->seq_show) goto err_free; if (ops->prealloc) { int len = of->atomic_write_len ?: PAGE_SIZE; of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); error = -ENOMEM; if (!of->prealloc_buf) goto err_free; mutex_init(&of->prealloc_mutex); } /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. */ if (ops->seq_show) error = seq_open(file, &kernfs_seq_ops); else error = seq_open(file, NULL); if (error) goto err_free; of->seq_file = file->private_data; of->seq_file->private = of; /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; /* make sure we have open node struct */ error = kernfs_get_open_node(kn, of); if (error) goto err_seq_release; if (ops->open) { /* nobody has access to @of yet, skip @of->mutex */ error = ops->open(of); if (error) goto err_put_node; } /* open succeeded, put active references */ kernfs_put_active(kn); return 0; err_put_node: kernfs_unlink_open_file(kn, of, true); err_seq_release: seq_release(inode, file); err_free: kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); return error; } /* used from release/drain to ensure that ->release() is called exactly once */ static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { /* * @of is guaranteed to have no other file operations in flight and * we just want to synchronize release and drain paths. * @kernfs_open_file_mutex_ptr(kn) is enough. @of->mutex can't be used * here because drain path may be called from places which can * cause circular dependency. */ lockdep_assert_held(kernfs_open_file_mutex_ptr(kn)); if (!of->released) { /* * A file is never detached without being released and we * need to be able to release files which are deactivated * and being drained. Don't use kernfs_ops(). */ kn->attr.ops->release(of); of->released = true; of_on(of)->nr_to_release--; } } static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); kernfs_release_file(kn, of); mutex_unlock(mutex); } kernfs_unlink_open_file(kn, of, false); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); return 0; } bool kernfs_should_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; bool ret; /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. * Callers post kernfs_unbreak_active_protection may be counted in * kn->active by now, do not WARN_ON because of them. */ rcu_read_lock(); on = rcu_dereference(kn->attr.open); ret = on && (on->nr_mmapped || on->nr_to_release); rcu_read_unlock(); return ret; } void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; struct mutex *mutex; mutex = kernfs_open_file_mutex_lock(kn); on = kernfs_deref_open_node_locked(kn); if (!on) { mutex_unlock(mutex); return; } list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (of->mmapped) { unmap_mapping_range(inode->i_mapping, 0, 0, 1); of->mmapped = false; on->nr_mmapped--; } if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } WARN_ON_ONCE(on->nr_mmapped || on->nr_to_release); mutex_unlock(mutex); } /* * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the * manager for the kobject supports notification), poll will * return EPOLLERR|EPOLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) { struct kernfs_open_node *on = of_on(of); poll_wait(of->file, &on->poll, wait); if (of->event != atomic_read(&on->event)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; return DEFAULT_POLLMASK; } static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); __poll_t ret; if (!kernfs_get_active_of(of)) return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; if (kn->attr.ops->poll) ret = kn->attr.ops->poll(of, wait); else ret = kernfs_generic_poll(of, wait); kernfs_put_active_of(of); return ret; } static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; loff_t ret; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active_of(of)) { mutex_unlock(&of->mutex); return -ENODEV; } ops = kernfs_ops(of->kn); if (ops->llseek) ret = ops->llseek(of, offset, whence); else ret = generic_file_llseek(file, offset, whence); kernfs_put_active_of(of); mutex_unlock(&of->mutex); return ret; } static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_super_info *info; struct kernfs_root *root; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); kn = kernfs_notify_list; if (kn == KERNFS_NOTIFY_EOL) { spin_unlock_irq(&kernfs_notify_lock); return; } kernfs_notify_list = kn->attr.notify_next; kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); root = kernfs_root(kn); /* kick fsnotify */ down_read(&root->kernfs_supers_rwsem); down_read(&root->kernfs_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; const char *kn_name; struct inode *inode; struct qstr name; /* * We want fsnotify_modify() on @kn but as the * modifications aren't originating from userland don't * have the matching @file available. Look up the inodes * and generate the events manually. */ inode = ilookup(info->sb, kernfs_ino(kn)); if (!inode) continue; kn_name = kernfs_rcu_name(kn); name = QSTR(kn_name); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); if (p_inode) { fsnotify(FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, p_inode, &name, inode, 0); iput(p_inode); } kernfs_put(parent); } if (!p_inode) fsnotify_inode(inode, FS_MODIFY); iput(inode); } up_read(&root->kernfs_rwsem); up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; } /** * kernfs_notify - notify a kernfs file * @kn: file to notify * * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any * context. */ void kernfs_notify(struct kernfs_node *kn) { static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; struct kernfs_open_node *on; if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) return; /* kick poll immediately */ rcu_read_lock(); on = rcu_dereference(kn->attr.open); if (on) { atomic_inc(&on->event); wake_up_interruptible(&on->poll); } rcu_read_unlock(); /* schedule work to kick fsnotify */ spin_lock_irqsave(&kernfs_notify_lock, flags); if (!kn->attr.notify_next) { kernfs_get(kn); kn->attr.notify_next = kernfs_notify_list; kernfs_notify_list = kn; schedule_work(&kernfs_notify_work); } spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read_iter = kernfs_fop_read_iter, .write_iter = kernfs_fop_write_iter, .llseek = kernfs_fop_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, .splice_read = copy_splice_read, .splice_write = iter_file_splice_write, }; /** * __kernfs_create_file - kernfs internal function to create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file * @uid: uid of the file * @gid: gid of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Return: the created node on success, ERR_PTR() value on error. */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { struct kernfs_node *kn; unsigned flags; int rc; flags = KERNFS_FILE; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, uid, gid, flags); if (!kn) return ERR_PTR(-ENOMEM); kn->attr.ops = ops; kn->attr.size = size; kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "kn->active", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif /* * kn->attr.ops is accessible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ if (ops->seq_show) kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); } return kn; } |
| 1683 9433 809 4819 3780 807 6747 6733 6753 6734 1369 807 807 807 807 809 1325 1325 1356 1359 1358 1353 1359 1358 12 12 12 4967 4968 4969 4972 4963 4976 4961 4961 4968 3789 2373 3780 3782 3786 3782 3782 2382 4969 2991 4950 4958 2393 3785 3789 17 82 82 4958 4953 402 403 403 401 402 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 | // SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/hypervisor.h> #include <linux/sched/clock.h> #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> #include <linux/string_choices.h> #include <trace/events/ipi.h> #define CREATE_TRACE_POINTS #include <trace/events/csd.h> #undef CREATE_TRACE_POINTS #include "smpboot.h" #include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1); static void __flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, cpu_to_node(cpu))) { free_cpumask_var(cfd->cpumask); return -ENOMEM; } cfd->csd = alloc_percpu(call_single_data_t); if (!cfd->csd) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } return 0; } int smpcfd_dead_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->csd); return 0; } int smpcfd_dying_cpu(unsigned int cpu) { /* * The IPIs for the smp-call-function callbacks queued by other CPUs * might arrive late, either due to hardware latencies or because this * CPU disabled interrupts (inside stop-machine) before the IPIs were * sent. So flush out any pending callbacks explicitly (without waiting * for the IPIs to arrive), to ensure that the outgoing CPU doesn't go * offline with work still pending. * * This runs with interrupts disabled inside the stopper task invoked by * stop_machine(), ensuring mutually exclusive CPU offlining and IPI flush. */ __flush_smp_call_function_queue(false); irq_work_run(); return 0; } void __init call_function_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); smpcfd_prepare_cpu(smp_processor_id()); } static __always_inline void send_call_function_single_ipi(int cpu) { if (call_function_single_prep_ipi(cpu)) { trace_ipi_send_cpu(cpu, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_single_ipi(cpu); } } static __always_inline void send_call_function_ipi_mask(struct cpumask *mask) { trace_ipi_send_cpumask(mask, _RET_IP_, generic_smp_call_function_single_interrupt); arch_send_call_function_ipi_mask(mask); } static __always_inline void csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd) { trace_csd_function_entry(func, csd); func(info); trace_csd_function_exit(func, csd); } #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled); /* * Parse the csdlock_debug= kernel boot parameter. * * If you need to restore the old "ext" value that once provided * additional debugging information, reapply the following commits: * * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging") * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging") */ static int __init csdlock_debug(char *str) { int ret; unsigned int val = 0; ret = get_option(&str, &val); if (ret) { if (val) static_branch_enable(&csdlock_debug_enabled); else static_branch_disable(&csdlock_debug_enabled); } return 1; } __setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */ module_param(csd_lock_timeout, ulong, 0644); static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */ module_param(panic_on_ipistall, int, 0644); static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ static void __csd_lock_record(call_single_data_t *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ __this_cpu_write(cur_csd, NULL); return; } __this_cpu_write(cur_csd_func, csd->func); __this_cpu_write(cur_csd_info, csd->info); smp_wmb(); /* func and info before csd. */ __this_cpu_write(cur_csd, csd); smp_mb(); /* Update cur_csd before function call. */ /* Or before unlock, as the case may be. */ } static __always_inline void csd_lock_record(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } static int csd_lock_wait_getcpu(call_single_data_t *csd) { unsigned int csd_type; csd_type = CSD_TYPE(csd); if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ return -1; } static atomic_t n_csd_lock_stuck; /** * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? * * Returns @true if a CSD-lock acquisition is stuck and has been stuck * long enough for a "non-responsive CSD lock" message to be printed. */ bool csd_lock_is_stuck(void) { return !!atomic_read(&n_csd_lock_stuck); } /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages) { int cpu = -1; int cpux; bool firsttime; u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); unsigned long long csd_lock_timeout_ns = csd_lock_timeout * NSEC_PER_MSEC; if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) return true; cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); atomic_dec(&n_csd_lock_stuck); return true; } ts2 = ktime_get_mono_fast_ns(); /* How long since we last checked for a stuck CSD lock.*/ ts_delta = ts2 - *ts1; if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) * (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) || csd_lock_timeout_ns == 0)) return false; if (ts0 > ts2) { /* Our own sched_clock went backward; don't blame another CPU. */ ts_delta = ts0 - ts2; pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta); *ts1 = ts2; return false; } firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); cpu = csd_lock_wait_getcpu(csd); if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) cpux = 0; else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ /* How long since this CSD lock was stuck. */ ts_delta = ts2 - ts0; pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); (*nmessages)++; if (firsttime) atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering * on underflows when the TSC is out of sync between sockets. */ BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC)); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), READ_ONCE(per_cpu(cur_csd_info, cpux))); } else { pr_alert("\tcsd: CSD lock (#%d) %s.\n", *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); } if (cpu >= 0) { if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0)) dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); } } if (firsttime) dump_stack(); *ts1 = ts2; return false; } /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * * For non-synchronous ipi calls the csd can still be in use by the * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ static void __csd_lock_wait(call_single_data_t *csd) { unsigned long nmessages = 0; int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = ktime_get_mono_fast_ns(); for (;;) { if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id, &nmessages)) break; cpu_relax(); } smp_acquire__after_ctrl_dep(); } static __always_inline void csd_lock_wait(call_single_data_t *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); return; } smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #else static void csd_lock_record(call_single_data_t *csd) { } static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other * fields of the specified call_single_data_t structure: */ smp_wmb(); } static __always_inline void csd_unlock(call_single_data_t *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_store_release(&csd->node.u_flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); void __smp_call_single_queue(int cpu, struct llist_node *node) { /* * We have to check the type of the CSD before queueing it, because * once queued it can have its flags cleared by * flush_smp_call_function_queue() * even if we haven't sent the smp_call IPI yet (e.g. the stopper * executes migration_cpu_stop() on the remote CPU). */ if (trace_csd_queue_cpu_enabled()) { call_single_data_t *csd; smp_call_func_t func; csd = container_of(node, call_single_data_t, node.llist); func = CSD_TYPE(csd) == CSD_TYPE_TTWU ? sched_ttwu_pending : csd->func; trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); } /* * The list addition should be visible to the target CPU when it pops * the head of the list to pull the entry off it in the IPI handler * because of normal cache coherency rules implied by the underlying * llist ops. * * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added * to arch code to make it appear to obey cache coherency WRT * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ if (llist_add(node, &per_cpu(call_single_queue, cpu))) send_call_function_single_ipi(cpu); } /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ static int generic_exec_single(int cpu, call_single_data_t *csd) { /* * Preemption already disabled here so stopper cannot run on this CPU, * ensuring mutually exclusive CPU offlining and last IPI flush. */ if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; unsigned long flags; /* * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); csd_do_func(func, info, NULL); csd_lock_record(NULL); local_irq_restore(flags); return 0; } if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } __smp_call_single_queue(cpu, &csd->node.llist); return 0; } /** * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks * * Invoked by arch to handle an IPI for call function single. * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { __flush_smp_call_function_queue(true); } /** * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. * * Flush any pending smp-call-function callbacks queued on this CPU. This is * invoked by the generic IPI handler, as well as by a CPU about to go offline, * to ensure that all pending IPI callbacks are run before it goes completely * offline. * * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ static void __flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; struct llist_head *head; static bool warned; atomic_t *tbt; lockdep_assert_irqs_disabled(); /* Allow waiters to send backtrace NMI from here onwards */ tbt = this_cpu_ptr(&trigger_backtrace); atomic_set_release(tbt, 1); head = this_cpu_ptr(&call_single_queue); entry = llist_del_all(head); entry = llist_reverse_order(entry); /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); /* * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ llist_for_each_entry(csd, entry, node.llist) { switch (CSD_TYPE(csd)) { case CSD_TYPE_ASYNC: case CSD_TYPE_SYNC: case CSD_TYPE_IRQ_WORK: pr_warn("IPI callback %pS sent to offline CPU\n", csd->func); break; case CSD_TYPE_TTWU: pr_warn("IPI task-wakeup sent to offline CPU\n"); break; default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); break; } } } /* * First; run all SYNC callbacks, people are waiting for us. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { /* Do we wait until *after* callback? */ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } csd_lock_record(csd); csd_do_func(func, info, csd); csd_unlock(csd); csd_lock_record(NULL); } else { prev = &csd->node.llist; } } if (!entry) return; /* * Second; run all !SYNC callbacks. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { smp_call_func_t func = csd->func; void *info = csd->info; csd_lock_record(csd); csd_unlock(csd); csd_do_func(func, info, csd); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } } else { prev = &csd->node.llist; } } /* * Third; only CSD_TYPE_TTWU is left, issue those. */ if (entry) { csd = llist_entry(entry, typeof(*csd), node.llist); csd_do_func(sched_ttwu_pending, entry, csd); } } /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * from task context (idle, migration thread) * * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to * handle queued SMP function calls before scheduling. * * The migration thread has to ensure that an eventually pending wakeup has * been handled before it migrates a task. */ void flush_smp_call_function_queue(void) { unsigned int was_pending; unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) return; local_irq_save(flags); /* Get the already pending soft interrupts for RT enabled kernels */ was_pending = local_softirq_pending(); __flush_smp_call_function_queue(true); if (local_softirq_pending()) do_softirq_post_smp_call_flush(was_pending); local_irq_restore(flags); } /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { call_single_data_t *csd; call_single_data_t csd_stack = { .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; /* * Prevent preemption and reschedule on another CPU, as well as CPU * removal. This prevents stopper from running on this CPU, thus * providing mutual exclusion of the below cpu_online() check and * IPI sending ensuring IPI are not missed by CPU going offline. */ this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); csd = &csd_stack; if (!wait) { csd = this_cpu_ptr(&csd_data); csd_lock(csd); } csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); put_cpu(); return err; } EXPORT_SYMBOL(smp_call_function_single); /** * smp_call_function_single_async() - Run an asynchronous function on a * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure * * Like smp_call_function_single(), but the call is asynchonous and * can thus be done from contexts with disabled interrupts. * * The caller passes his own pre-allocated data structure * (ie: embedded in an object) and is responsible for synchronizing it * such that the IPIs performed on the @csd are strictly serialized. * * If the function is called with one csd which has not yet been * processed by previous call to smp_call_function_single_async(), the * function will return immediately with -EBUSY showing that the csd * object is still in progress. * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. * * Return: %0 on success or negative errno value on error */ int smp_call_function_single_async(int cpu, call_single_data_t *csd) { int err = 0; preempt_disable(); if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); out: preempt_enable(); return err; } EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed. * * Returns 0 on success, else a negative status code (if no cpus were online). * * Selection preference: * 1) current cpu if in @mask * 2) nearest cpu in @mask, based on NUMA topology */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); if (!cpumask_test_cpu(cpu, mask)) cpu = sched_numa_find_nth_cpu(mask, 0, cpu_to_node(cpu)); ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; } EXPORT_SYMBOL_GPL(smp_call_function_any); /* * Flags to be used as scf_flags argument of smp_call_function_many_cond(). * * %SCF_WAIT: Wait until function execution is completed * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask */ #define SCF_WAIT (1U << 0) #define SCF_RUN_LOCAL (1U << 1) static void smp_call_function_many_cond(const struct cpumask *mask, smp_call_func_t func, void *info, unsigned int scf_flags, smp_cond_func_t cond_func) { int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; int nr_cpus = 0; bool run_remote = false; lockdep_assert_preemption_disabled(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) lockdep_assert_irqs_enabled(); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); /* Check if we need remote execution, i.e., any CPU excluding this one. */ if (cpumask_any_and_but(mask, cpu_online_mask, this_cpu) < nr_cpu_ids) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu); if (cond_func && !cond_func(cpu, info)) { __cpumask_clear_cpu(cpu, cfd->cpumask); continue; } /* Work is enqueued on a remote CPU. */ run_remote = true; csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif trace_csd_queue_cpu(cpu, _RET_IP_, func, csd); /* * Kick the remote CPU if this is the first work * item enqueued. */ if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; } } /* * Choose the most efficient way to send an IPI. Note that the * number of CPUs might be zero due to concurrent changes to the * provided mask. */ if (nr_cpus == 1) send_call_function_single_ipi(last_cpu); else if (likely(nr_cpus > 1)) send_call_function_ipi_mask(cfd->cpumask_ipi); } /* Check if we need local execution. */ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) && (!cond_func || cond_func(this_cpu, info))) { unsigned long flags; local_irq_save(flags); csd_do_func(func, info, NULL); local_irq_restore(flags); } if (run_remote && wait) { for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; csd = per_cpu_ptr(cfd->csd, cpu); csd_lock_wait(csd); } } } /** * smp_call_function_many(): Run a function on a set of CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. * * @func is not called on the local CPU even if @mask contains it. Consider * using on_each_cpu_cond_mask() instead if this is not desirable. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL); } EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); } EXPORT_SYMBOL(smp_call_function); /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; EXPORT_SYMBOL(setup_max_cpus); /* * Setup routine for controlling SMP activation * * Command-line option of "nosmp" or "maxcpus=0" will disable SMP * activation entirely (the MPS table probe still happens, though). * * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer * greater than 0, limits the maximum number of CPUs activated in * SMP mode to <NUM>. */ void __weak __init arch_disable_smp_support(void) { } static int __init nosmp(char *str) { setup_max_cpus = 0; arch_disable_smp_support(); return 0; } early_param("nosmp", nosmp); /* this is hard limit */ static int __init nrcpus(char *str) { int nr_cpus; if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) set_nr_cpu_ids(nr_cpus); return 0; } early_param("nr_cpus", nrcpus); static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); if (setup_max_cpus == 0) arch_disable_smp_support(); return 0; } early_param("maxcpus", maxcpus); #if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS) /* Setup number of possible processor ids */ unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); #endif /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ void __init setup_nr_cpu_ids(void) { set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1); } /* Called by boot processor to activate the rest. */ void __init smp_init(void) { int num_nodes, num_cpus; idle_threads_init(); cpuhp_threads_init(); pr_info("Bringing up secondary CPUs ...\n"); bringup_nonboot_cpus(setup_max_cpus); num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus)); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } /* * on_each_cpu_cond(): Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and * the info parameter. The function is called * with preemption disabled. The function should * return a boolean value indicating whether to IPI * the specified CPU. * @func: The function to run on all applicable CPUs. * This must be fast and non-blocking. * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. * * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) { unsigned int scf_flags = SCF_RUN_LOCAL; if (wait) scf_flags |= SCF_WAIT; preempt_disable(); smp_call_function_many_cond(mask, func, info, scf_flags, cond_func); preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); static void do_nothing(void *unused) { } /** * kick_all_cpus_sync - Force all cpus out of idle * * Used to synchronize the update of pm_idle function pointer. It's * called after the pointer is updated and returns after the dummy * callback function has been executed on all cpus. The execution of * the function can only happen on the remote cpus after they have * left the idle function which had been called via pm_idle function * pointer. So it's guaranteed that nothing uses the previous pointer * anymore. */ void kick_all_cpus_sync(void) { /* Make sure the change is visible before we kick the cpus */ smp_mb(); smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); /** * wake_up_all_idle_cpus - break all cpus out of idle * wake_up_all_idle_cpus try to break all cpus which is in idle state even * including idle polling cpus, for non-idle cpus, we will do nothing * for them. */ void wake_up_all_idle_cpus(void) { int cpu; for_each_possible_cpu(cpu) { preempt_disable(); if (cpu != smp_processor_id() && cpu_online(cpu)) wake_up_if_idle(cpu); preempt_enable(); } } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct * @done: &completion to signal * @func: function to call * @data: function's data argument * @ret: return value from @func * @cpu: target CPU (%-1 for any CPU) * * Used to call a function on a specific cpu and wait for it to return. * Optionally make sure the call is done on a specified physical cpu via vcpu * pinning in order to support virtualized environments. */ struct smp_call_on_cpu_struct { struct work_struct work; struct completion done; int (*func)(void *); void *data; int ret; int cpu; }; static void smp_call_on_cpu_callback(struct work_struct *work) { struct smp_call_on_cpu_struct *sscs; sscs = container_of(work, struct smp_call_on_cpu_struct, work); if (sscs->cpu >= 0) hypervisor_pin_vcpu(sscs->cpu); sscs->ret = sscs->func(sscs->data); if (sscs->cpu >= 0) hypervisor_pin_vcpu(-1); complete(&sscs->done); } int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), .func = func, .data = par, .cpu = phys ? cpu : -1, }; INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; queue_work_on(cpu, system_wq, &sscs.work); wait_for_completion(&sscs.done); destroy_work_on_stack(&sscs.work); return sscs.ret; } EXPORT_SYMBOL_GPL(smp_call_on_cpu); |
| 14159 77 662 748 45 707 44 86 32 12882 4 814 12885 15556 1563 12897 13245 13267 13243 283 283 283 15545 35 13288 35 13173 13216 15521 662 15540 15567 4961 9844 1 13489 13355 173 15001 14169 14006 14059 14024 1950 12893 12887 749 12897 44 45 12930 98 283 8828 11 25 3 24 8881 12885 12865 12897 12882 190 12877 124 123 1107 52 496 496 83 12087 181 308 1453 1451 1421 46 5 11 1373 662 656 4 112 1372 4 1151 520 1453 1454 1454 125 1446 883 670 76 77 83 82 83 83 81 83 83 83 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Resizable, Scalable, Concurrent Hash Table * * Copyright (c) 2015-2016 Herbert Xu <herbert@gondor.apana.org.au> * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch> * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> * * Code partially derived from nft_hash * Rewritten with rehash code from br_multicast plus single list * pointer as suggested by Josh Triplett * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #ifndef _LINUX_RHASHTABLE_H #define _LINUX_RHASHTABLE_H #include <linux/err.h> #include <linux/errno.h> #include <linux/jhash.h> #include <linux/list_nulls.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <linux/bit_spinlock.h> #include <linux/rhashtable-types.h> /* * Objects in an rhashtable have an embedded struct rhash_head * which is linked into as hash chain from the hash table - or one * of two or more hash tables when the rhashtable is being resized. * The end of the chain is marked with a special nulls marks which has * the least significant bit set but otherwise stores the address of * the hash bucket. This allows us to be sure we've found the end * of the right list. * The value stored in the hash bucket has BIT(0) used as a lock bit. * This bit must be atomically set before any changes are made to * the chain. To avoid dereferencing this pointer without clearing * the bit first, we use an opaque 'struct rhash_lock_head *' for the * pointer stored in the bucket. This struct needs to be defined so * that rcu_dereference() works on it, but it has no content so a * cast is needed for it to be useful. This ensures it isn't * used by mistake with clearing the lock bit first. */ struct rhash_lock_head {}; /* Maximum chain length before rehash * * The maximum (not average) chain length grows with the size of the hash * table, at a rate of (log N)/(log log N). * * The value of 16 is selected so that even if the hash table grew to * 2^32 you would not expect the maximum chain length to exceed it * unless we are under attack (or extremely unlucky). * * As this limit is only to detect attacks, we don't need to set it to a * lower value as you'd need the chain length to vastly exceed 16 to have * any real effect on the system. */ #define RHT_ELASTICITY 16u /** * struct bucket_table - Table of hash buckets * @size: Number of hash buckets * @nest: Number of bits of first-level nested table. * @rehash: Current bucket being rehashed * @hash_rnd: Random seed to fold into hash * @walkers: List of active walkers * @rcu: RCU structure for freeing the table * @future_tbl: Table under construction during rehashing * @ntbl: Nested table used when out of memory. * @buckets: size * hash buckets */ struct bucket_table { unsigned int size; unsigned int nest; u32 hash_rnd; struct list_head walkers; struct rcu_head rcu; struct bucket_table __rcu *future_tbl; struct lockdep_map dep_map; struct rhash_lock_head __rcu *buckets[] ____cacheline_aligned_in_smp; }; /* * NULLS_MARKER() expects a hash value with the low * bits mostly likely to be significant, and it discards * the msb. * We give it an address, in which the bottom bit is * always 0, and the msb might be significant. * So we shift the address down one bit to align with * expectations and avoid losing a significant bit. * * We never store the NULLS_MARKER in the hash table * itself as we need the lsb for locking. * Instead we store a NULL */ #define RHT_NULLS_MARKER(ptr) \ ((void *)NULLS_MARKER(((unsigned long) (ptr)) >> 1)) #define INIT_RHT_NULLS_HEAD(ptr) \ ((ptr) = NULL) static inline bool rht_is_a_nulls(const struct rhash_head *ptr) { return ((unsigned long) ptr & 1); } static inline void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) { return (char *)he - ht->p.head_offset; } static inline unsigned int rht_bucket_index(const struct bucket_table *tbl, unsigned int hash) { return hash & (tbl->size - 1); } static __always_inline unsigned int rht_key_get_hash(struct rhashtable *ht, const void *key, const struct rhashtable_params params, unsigned int hash_rnd) { unsigned int hash; /* params must be equal to ht->p if it isn't constant. */ if (!__builtin_constant_p(params.key_len)) hash = ht->p.hashfn(key, ht->key_len, hash_rnd); else if (params.key_len) { unsigned int key_len = params.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else if (key_len & (sizeof(u32) - 1)) hash = jhash(key, key_len, hash_rnd); else hash = jhash2(key, key_len / sizeof(u32), hash_rnd); } else { unsigned int key_len = ht->p.key_len; if (params.hashfn) hash = params.hashfn(key, key_len, hash_rnd); else hash = jhash(key, key_len, hash_rnd); } return hash; } static __always_inline unsigned int rht_key_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const void *key, const struct rhashtable_params params) { unsigned int hash = rht_key_get_hash(ht, key, params, tbl->hash_rnd); return rht_bucket_index(tbl, hash); } static __always_inline unsigned int rht_head_hashfn( struct rhashtable *ht, const struct bucket_table *tbl, const struct rhash_head *he, const struct rhashtable_params params) { const char *ptr = rht_obj(ht, he); return likely(params.obj_hashfn) ? rht_bucket_index(tbl, params.obj_hashfn(ptr, params.key_len ?: ht->p.key_len, tbl->hash_rnd)) : rht_key_hashfn(ht, tbl, ptr + params.key_offset, params); } /** * rht_grow_above_75 - returns true if nelems > 0.75 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_75(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Expand table when exceeding 75% load */ return atomic_read(&ht->nelems) > (tbl->size / 4 * 3) && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size * @ht: hash table * @tbl: current table */ static inline bool rht_shrink_below_30(const struct rhashtable *ht, const struct bucket_table *tbl) { /* Shrink table beneath 30% load */ return atomic_read(&ht->nelems) < (tbl->size * 3 / 10) && tbl->size > ht->p.min_size; } /** * rht_grow_above_100 - returns true if nelems > table-size * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_100(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) > tbl->size && (!ht->p.max_size || tbl->size < ht->p.max_size); } /** * rht_grow_above_max - returns true if table is above maximum * @ht: hash table * @tbl: current table */ static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { return atomic_read(&ht->nelems) >= ht->max_elems; } #ifdef CONFIG_PROVE_LOCKING int lockdep_rht_mutex_is_held(struct rhashtable *ht); int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash); #else static inline int lockdep_rht_mutex_is_held(struct rhashtable *ht) { return 1; } static inline int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) { return 1; } #endif /* CONFIG_PROVE_LOCKING */ void *rhashtable_insert_slow(struct rhashtable *ht, const void *key, struct rhash_head *obj); void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter); void rhashtable_walk_exit(struct rhashtable_iter *iter); int rhashtable_walk_start_check(struct rhashtable_iter *iter) __acquires(RCU); static inline void rhashtable_walk_start(struct rhashtable_iter *iter) { (void)rhashtable_walk_start_check(iter); } void *rhashtable_walk_next(struct rhashtable_iter *iter); void *rhashtable_walk_peek(struct rhashtable_iter *iter); void rhashtable_walk_stop(struct rhashtable_iter *iter) __releases(RCU); void rhashtable_free_and_destroy(struct rhashtable *ht, void (*free_fn)(void *ptr, void *arg), void *arg); void rhashtable_destroy(struct rhashtable *ht); struct rhash_lock_head __rcu **rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **__rht_bucket_nested( const struct bucket_table *tbl, unsigned int hash); struct rhash_lock_head __rcu **rht_bucket_nested_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash); #define rht_dereference(p, ht) \ rcu_dereference_protected(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_rcu(p, ht) \ rcu_dereference_all_check(p, lockdep_rht_mutex_is_held(ht)) #define rht_dereference_bucket(p, tbl, hash) \ rcu_dereference_protected(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_dereference_bucket_rcu(p, tbl, hash) \ rcu_dereference_all_check(p, lockdep_rht_bucket_is_held(tbl, hash)) #define rht_entry(tpos, pos, member) \ ({ tpos = container_of(pos, typeof(*tpos), member); 1; }) static inline struct rhash_lock_head __rcu *const *rht_bucket( const struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_var( struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? __rht_bucket_nested(tbl, hash) : &tbl->buckets[hash]; } static inline struct rhash_lock_head __rcu **rht_bucket_insert( struct rhashtable *ht, struct bucket_table *tbl, unsigned int hash) { return unlikely(tbl->nest) ? rht_bucket_nested_insert(ht, tbl, hash) : &tbl->buckets[hash]; } /* * We lock a bucket by setting BIT(0) in the pointer - this is always * zero in real pointers. The NULLS mark is never stored in the bucket, * rather we store NULL if the bucket is empty. * bit_spin_locks do not handle contention well, but the whole point * of the hashtable design is to achieve minimum per-bucket contention. * A nested hash table might not have a bucket pointer. In that case * we cannot get a lock. For remove and replace the bucket cannot be * interesting and doesn't need locking. * For insert we allocate the bucket if this is the last bucket_table, * and then take the lock. * Sometimes we unlock a bucket by writing a new pointer there. In that * case we don't need to unlock, but we do need to reset state such as * local_bh. For that we have rht_assign_unlock(). As rcu_assign_pointer() * provides the same release semantics that bit_spin_unlock() provides, * this is safe. * When we write to a bucket without unlocking, we use rht_assign_locked(). */ static inline unsigned long rht_lock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); return flags; } static inline unsigned long rht_lock_nested(struct bucket_table *tbl, struct rhash_lock_head __rcu **bucket, unsigned int subclass) { unsigned long flags; local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); return flags; } static inline void rht_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, unsigned long flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); local_irq_restore(flags); } static inline struct rhash_head *__rht_ptr( struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) { return (struct rhash_head *) ((unsigned long)p & ~BIT(0) ?: (unsigned long)RHT_NULLS_MARKER(bkt)); } /* * Where 'bkt' is a bucket and might be locked: * rht_ptr_rcu() dereferences that pointer and clears the lock bit. * rht_ptr() dereferences in a context where the bucket is locked. * rht_ptr_exclusive() dereferences in a context where exclusive * access is guaranteed, such as when destroying the table. */ static inline struct rhash_head *rht_ptr_rcu( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference_all(*bkt), bkt); } static inline struct rhash_head *rht_ptr( struct rhash_lock_head __rcu *const *bkt, struct bucket_table *tbl, unsigned int hash) { return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); } static inline struct rhash_head *rht_ptr_exclusive( struct rhash_lock_head __rcu *const *bkt) { return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); } static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, struct rhash_head *obj) { if (rht_is_a_nulls(obj)) obj = NULL; rcu_assign_pointer(*bkt, (void *)((unsigned long)obj | BIT(0))); } static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, struct rhash_head *obj, unsigned long flags) { if (rht_is_a_nulls(obj)) obj = NULL; lock_map_release(&tbl->dep_map); rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); local_irq_restore(flags); } /** * rht_for_each_from - iterate over hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each_from(pos, head, tbl, hash) \ for (pos = head; \ !rht_is_a_nulls(pos); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each - iterate over hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index */ #define rht_for_each(pos, tbl, hash) \ rht_for_each_from(pos, rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash) /** * rht_for_each_entry_from - iterate over hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry_from(tpos, pos, head, tbl, hash, member) \ for (pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket((pos)->next, tbl, hash)) /** * rht_for_each_entry - iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. */ #define rht_for_each_entry(tpos, pos, tbl, hash, member) \ rht_for_each_entry_from(tpos, pos, \ rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ tbl, hash, member) /** * rht_for_each_entry_safe - safely iterate over hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @next: the &struct rhash_head to use as next in loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive allows for the looped code to * remove the loop cursor from the list. */ #define rht_for_each_entry_safe(tpos, pos, next, tbl, hash, member) \ for (pos = rht_ptr(rht_bucket(tbl, hash), tbl, hash), \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = next, \ next = !rht_is_a_nulls(pos) ? \ rht_dereference_bucket(pos->next, tbl, hash) : NULL) /** * rht_for_each_rcu_from - iterate over rcu hash chain from given head * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu_from(pos, head, tbl, hash) \ for (({barrier(); }), \ pos = head; \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_all(pos->next)) /** * rht_for_each_rcu - iterate over rcu hash chain * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_rcu(pos, tbl, hash) \ for (({barrier(); }), \ pos = rht_ptr_rcu(rht_bucket(tbl, hash)); \ !rht_is_a_nulls(pos); \ pos = rcu_dereference_all(pos->next)) /** * rht_for_each_entry_rcu_from - iterated over rcu hash chain from given head * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @head: the &struct rhash_head to start from * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu_from(tpos, pos, head, tbl, hash, member) \ for (({barrier(); }), \ pos = head; \ (!rht_is_a_nulls(pos)) && rht_entry(tpos, pos, member); \ pos = rht_dereference_bucket_rcu(pos->next, tbl, hash)) /** * rht_for_each_entry_rcu - iterate over rcu hash chain of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rhash_head to use as a loop cursor. * @tbl: the &struct bucket_table * @hash: the hash value / bucket index * @member: name of the &struct rhash_head within the hashable struct. * * This hash chain list-traversal primitive may safely run concurrently with * the _rcu mutation primitives such as rhashtable_insert() as long as the * traversal is guarded by rcu_read_lock(). */ #define rht_for_each_entry_rcu(tpos, pos, tbl, hash, member) \ rht_for_each_entry_rcu_from(tpos, pos, \ rht_ptr_rcu(rht_bucket(tbl, hash)), \ tbl, hash, member) /** * rhl_for_each_rcu - iterate over rcu hash table list * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_rcu(pos, list) \ for (pos = list; pos; pos = rcu_dereference_all(pos->next)) /** * rhl_for_each_entry_rcu - iterate over rcu hash table list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct rlist_head to use as a loop cursor. * @list: the head of the list * @member: name of the &struct rlist_head within the hashable struct. * * This hash chain list-traversal primitive should be used on the * list returned by rhltable_lookup. */ #define rhl_for_each_entry_rcu(tpos, pos, list, member) \ for (pos = list; pos && rht_entry(tpos, pos, member); \ pos = rcu_dereference_all(pos->next)) static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, const void *obj) { struct rhashtable *ht = arg->ht; const char *ptr = obj; return memcmp(ptr + ht->p.key_offset, arg->key, ht->p.key_len); } /* Internal function, do not use. */ static __always_inline struct rhash_head *__rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu *const *bkt; struct bucket_table *tbl; struct rhash_head *he; unsigned int hash; tbl = rht_dereference_rcu(ht->tbl, ht); restart: hash = rht_key_hashfn(ht, tbl, key, params); bkt = rht_bucket(tbl, hash); do { rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { if (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, he)) : rhashtable_compare(&arg, rht_obj(ht, he))) continue; return he; } /* An object might have been moved to a different hash chain, * while we walk along it - better check and retry. */ } while (he != RHT_NULLS_MARKER(bkt)); /* Ensure we see any new tables. */ smp_rmb(); tbl = rht_dereference_rcu(tbl->future_tbl, ht); if (unlikely(tbl)) goto restart; return NULL; } /** * rhashtable_lookup - search hash table * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for an entry with an identical key. The first matching entry is returned. * * This must only be called under the RCU read lock. * * Returns the first entry on which the compare function returned true. */ static __always_inline void *rhashtable_lookup( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(ht, key, params); return he ? rht_obj(ht, he) : NULL; } /** * rhashtable_lookup_fast - search hash table, without RCU read lock * @ht: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for an entry with an identical key. The first matching entry is returned. * * Only use this function when you have other mechanisms guaranteeing * that the object won't go away after the RCU read lock is released. * * Returns the first entry on which the compare function returned true. */ static __always_inline void *rhashtable_lookup_fast( struct rhashtable *ht, const void *key, const struct rhashtable_params params) { void *obj; rcu_read_lock(); obj = rhashtable_lookup(ht, key, params); rcu_read_unlock(); return obj; } /** * rhltable_lookup - search hash list table * @hlt: hash table * @key: the pointer to the key * @params: hash table parameters * * Computes the hash value for the key and traverses the bucket chain looking * for an entry with an identical key. All matching entries are returned * in a list. * * This must only be called under the RCU read lock. * * Returns the list of entries that match the given key. */ static __always_inline struct rhlist_head *rhltable_lookup( struct rhltable *hlt, const void *key, const struct rhashtable_params params) { struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); return he ? container_of(he, struct rhlist_head, rhead) : NULL; } /* Internal function, please use rhashtable_insert_fast() instead. This * function returns the existing element already in hashes if there is a clash, * otherwise it returns an error via ERR_PTR(). */ static __always_inline void *__rhashtable_insert_fast( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhashtable_compare_arg arg = { .ht = ht, .key = key, }; struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; unsigned long flags; unsigned int hash; int elasticity; void *data; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); hash = rht_head_hashfn(ht, tbl, obj, params); elasticity = RHT_ELASTICITY; bkt = rht_bucket_insert(ht, tbl, hash); data = ERR_PTR(-ENOMEM); if (!bkt) goto out; pprev = NULL; flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } rht_for_each_from(head, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *plist; struct rhlist_head *list; elasticity--; if (!key || (params.obj_cmpfn ? params.obj_cmpfn(&arg, rht_obj(ht, head)) : rhashtable_compare(&arg, rht_obj(ht, head)))) { pprev = &head->next; continue; } data = rht_obj(ht, head); if (!rhlist) goto out_unlock; list = container_of(obj, struct rhlist_head, rhead); plist = container_of(head, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, plist); head = rht_dereference_bucket(head->next, tbl, hash); RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } if (elasticity <= 0) goto slow_path; data = ERR_PTR(-E2BIG); if (unlikely(rht_grow_above_max(ht, tbl))) goto out_unlock; if (unlikely(rht_grow_above_100(ht, tbl))) goto slow_path; /* Inserting at head of list makes unlocking free. */ head = rht_ptr(bkt, tbl, hash); RCU_INIT_POINTER(obj->next, head); if (rhlist) { struct rhlist_head *list; list = container_of(obj, struct rhlist_head, rhead); RCU_INIT_POINTER(list->next, NULL); } atomic_inc(&ht->nelems); rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); data = NULL; out: rcu_read_unlock(); return data; out_unlock: rht_unlock(tbl, bkt, flags); goto out; } /** * rhashtable_insert_fast - insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static __always_inline int rhashtable_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; ret = __rhashtable_insert_fast(ht, NULL, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhltable_insert_key - insert object into hash list table * @hlt: hash list table * @key: the pointer to the key * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static __always_inline int rhltable_insert_key( struct rhltable *hlt, const void *key, struct rhlist_head *list, const struct rhashtable_params params) { return PTR_ERR(__rhashtable_insert_fast(&hlt->ht, key, &list->rhead, params, true)); } /** * rhltable_insert - insert object into hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Will take the per bucket bitlock to protect against mutual mutations * on the same bucket. Multiple insertions may occur in parallel unless * they map to the same bucket. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static __always_inline int rhltable_insert( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { const char *key = rht_obj(&hlt->ht, &list->rhead); key += params.key_offset; return rhltable_insert_key(hlt, key, list, params); } /** * rhashtable_lookup_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * This lookup function may only be used for fixed key hash table (key_len * parameter set). It will BUG() if used inappropriately. * * It is safe to call this function from atomic context. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. */ static __always_inline int rhashtable_lookup_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); void *ret; BUG_ON(ht->p.obj_hashfn); ret = __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_fast(), but this function returns the * object if it exists, NULL if it did not and the insertion was successful, * and an ERR_PTR otherwise. */ static __always_inline void *rhashtable_lookup_get_insert_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { const char *key = rht_obj(ht, obj); BUG_ON(ht->p.obj_hashfn); return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, false); } /** * rhashtable_lookup_insert_key - search and insert object to hash table * with explicit key * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Lookups may occur in parallel with hashtable mutations and resizing. * * Will trigger an automatic deferred table resizing if residency in the * table grows beyond 70%. * * Returns zero on success. */ static __always_inline int rhashtable_lookup_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { void *ret; BUG_ON(!ht->p.obj_hashfn || !key); ret = __rhashtable_insert_fast(ht, key, obj, params, false); if (IS_ERR(ret)) return PTR_ERR(ret); return ret == NULL ? 0 : -EEXIST; } /** * rhashtable_lookup_get_insert_key - lookup and insert object into hash table * @ht: hash table * @key: key * @obj: pointer to hash head inside object * @params: hash table parameters * * Just like rhashtable_lookup_insert_key(), but this function returns the * object if it exists, NULL if it does not and the insertion was successful, * and an ERR_PTR otherwise. */ static __always_inline void *rhashtable_lookup_get_insert_key( struct rhashtable *ht, const void *key, struct rhash_head *obj, const struct rhashtable_params params) { BUG_ON(!ht->p.obj_hashfn || !key); return __rhashtable_insert_fast(ht, key, obj, params, false); } /* Internal function, please use rhashtable_remove_fast() instead */ static __always_inline int __rhashtable_remove_fast_one( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; hash = rht_head_hashfn(ht, tbl, obj, params); bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; list = container_of(he, struct rhlist_head, rhead); if (he != obj) { struct rhlist_head __rcu **lpprev; pprev = &he->next; if (!rhlist) continue; do { lpprev = &list->next; list = rht_dereference_bucket(list->next, tbl, hash); } while (list && obj != &list->rhead); if (!list) continue; list = rht_dereference_bucket(list->next, tbl, hash); RCU_INIT_POINTER(*lpprev, list); err = 0; break; } obj = rht_dereference_bucket(obj->next, tbl, hash); err = 1; if (rhlist) { list = rht_dereference_bucket(list->next, tbl, hash); if (list) { RCU_INIT_POINTER(list->rhead.next, obj); obj = &list->rhead; err = 0; } } if (pprev) { rcu_assign_pointer(*pprev, obj); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj, flags); } goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); if (unlikely(ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))) schedule_work(&ht->run_work); err = 0; } return err; } /* Internal function, please use rhashtable_remove_fast() instead */ static __always_inline int __rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params, bool rhlist) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_remove_fast_one(ht, tbl, obj, params, rhlist)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhashtable_remove_fast - remove object from hash table * @ht: hash table * @obj: pointer to hash head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerable slow if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30%. * * Returns zero on success, -ENOENT if the entry could not be found. */ static __always_inline int rhashtable_remove_fast( struct rhashtable *ht, struct rhash_head *obj, const struct rhashtable_params params) { return __rhashtable_remove_fast(ht, obj, params, false); } /** * rhltable_remove - remove object from hash list table * @hlt: hash list table * @list: pointer to hash list head inside object * @params: hash table parameters * * Since the hash chain is single linked, the removal operation needs to * walk the bucket chain upon removal. The removal operation is thus * considerably slower if the hash table is not correctly sized. * * Will automatically shrink the table if permitted when residency drops * below 30% * * Returns zero on success, -ENOENT if the entry could not be found. */ static __always_inline int rhltable_remove( struct rhltable *hlt, struct rhlist_head *list, const struct rhashtable_params params) { return __rhashtable_remove_fast(&hlt->ht, &list->rhead, params, true); } /* Internal function, please use rhashtable_replace_fast() instead */ static __always_inline int __rhashtable_replace_fast( struct rhashtable *ht, struct bucket_table *tbl, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; unsigned long flags; unsigned int hash; int err = -ENOENT; /* Minimally, the old and new objects must have same hash * (which should mean identifiers are the same). */ hash = rht_head_hashfn(ht, tbl, obj_old, params); if (hash != rht_head_hashfn(ht, tbl, obj_new, params)) return -EINVAL; bkt = rht_bucket_var(tbl, hash); if (!bkt) return -ENOENT; pprev = NULL; flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { pprev = &he->next; continue; } rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); rht_unlock(tbl, bkt, flags); } else { rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } rht_unlock(tbl, bkt, flags); unlocked: return err; } /** * rhashtable_replace_fast - replace an object in hash table * @ht: hash table * @obj_old: pointer to hash head inside object being replaced * @obj_new: pointer to hash head inside object which is new * @params: hash table parameters * * Replacing an object doesn't affect the number of elements in the hash table * or bucket, so we don't need to worry about shrinking or expanding the * table here. * * Returns zero on success, -ENOENT if the entry could not be found, * -EINVAL if hash is not the same for the old and new objects. */ static __always_inline int rhashtable_replace_fast( struct rhashtable *ht, struct rhash_head *obj_old, struct rhash_head *obj_new, const struct rhashtable_params params) { struct bucket_table *tbl; int err; rcu_read_lock(); tbl = rht_dereference_rcu(ht->tbl, ht); /* Because we have already taken (and released) the bucket * lock in old_tbl, if we find that future_tbl is not yet * visible then that guarantees the entry to still be in * the old tbl if it exists. */ while ((err = __rhashtable_replace_fast(ht, tbl, obj_old, obj_new, params)) && (tbl = rht_dereference_rcu(tbl->future_tbl, ht))) ; rcu_read_unlock(); return err; } /** * rhltable_walk_enter - Initialise an iterator * @hlt: Table to walk over * @iter: Hash table Iterator * * This function prepares a hash table walk. * * Note that if you restart a walk after rhashtable_walk_stop you * may see the same object twice. Also, you may miss objects if * there are removals in between rhashtable_walk_stop and the next * call to rhashtable_walk_start. * * For a completely stable walk you should construct your own data * structure outside the hash table. * * This function may be called from any process context, including * non-preemptable context, but cannot be called from softirq or * hardirq context. * * You must call rhashtable_walk_exit after this function returns. */ static inline void rhltable_walk_enter(struct rhltable *hlt, struct rhashtable_iter *iter) { rhashtable_walk_enter(&hlt->ht, iter); } /** * rhltable_free_and_destroy - free elements and destroy hash list table * @hlt: the hash list table to destroy * @free_fn: callback to release resources of element * @arg: pointer passed to free_fn * * See documentation for rhashtable_free_and_destroy. */ static inline void rhltable_free_and_destroy(struct rhltable *hlt, void (*free_fn)(void *ptr, void *arg), void *arg) { rhashtable_free_and_destroy(&hlt->ht, free_fn, arg); } static inline void rhltable_destroy(struct rhltable *hlt) { rhltable_free_and_destroy(hlt, NULL, NULL); } #endif /* _LINUX_RHASHTABLE_H */ |
| 23 3 5 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ksm #if !defined(_TRACE_KSM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KSM_H #include <linux/tracepoint.h> /** * ksm_scan_template - called for start / stop scan * * @seq: sequence number of scan * @rmap_entries: actual number of rmap entries * * Allows to trace the start / stop of a ksm scan. */ DECLARE_EVENT_CLASS(ksm_scan_template, TP_PROTO(int seq, u32 rmap_entries), TP_ARGS(seq, rmap_entries), TP_STRUCT__entry( __field(int, seq) __field(u32, rmap_entries) ), TP_fast_assign( __entry->seq = seq; __entry->rmap_entries = rmap_entries; ), TP_printk("seq %d rmap size %d", __entry->seq, __entry->rmap_entries) ); /** * ksm_start_scan - called after a new ksm scan is started * * @seq: sequence number of scan * @rmap_entries: actual number of rmap entries * * Allows to trace the start of a ksm scan. */ DEFINE_EVENT(ksm_scan_template, ksm_start_scan, TP_PROTO(int seq, u32 rmap_entries), TP_ARGS(seq, rmap_entries) ); /** * ksm_stop_scan - called after a new ksm scan has completed * * @seq: sequence number of scan * @rmap_entries: actual number of rmap entries * * Allows to trace the completion of a ksm scan. */ DEFINE_EVENT(ksm_scan_template, ksm_stop_scan, TP_PROTO(int seq, u32 rmap_entries), TP_ARGS(seq, rmap_entries) ); /** * ksm_enter - called after a new process has been added / removed from ksm * * @mm: address of the mm object of the process * * Allows to trace the when a process has been added or removed from ksm. */ DECLARE_EVENT_CLASS(ksm_enter_exit_template, TP_PROTO(void *mm), TP_ARGS(mm), TP_STRUCT__entry( __field(void *, mm) ), TP_fast_assign( __entry->mm = mm; ), TP_printk("mm %p", __entry->mm) ); /** * ksm_enter - called after a new process has been added to ksm * * @mm: address of the mm object of the process * * Allows to trace the when a process has been added to ksm. */ DEFINE_EVENT(ksm_enter_exit_template, ksm_enter, TP_PROTO(void *mm), TP_ARGS(mm) ); /** * ksm_exit - called after a new process has been removed from ksm * * @mm: address of the mm object of the process * * Allows to trace the when a process has been removed from ksm. */ DEFINE_EVENT(ksm_enter_exit_template, ksm_exit, TP_PROTO(void *mm), TP_ARGS(mm) ); /** * ksm_merge_one_page - called after a page has been merged * * @pfn: page frame number of ksm page * @rmap_item: address of rmap_item object * @mm: address of the process mm struct * @err: success * * Allows to trace the ksm merging of individual pages. */ TRACE_EVENT(ksm_merge_one_page, TP_PROTO(unsigned long pfn, void *rmap_item, void *mm, int err), TP_ARGS(pfn, rmap_item, mm, err), TP_STRUCT__entry( __field(unsigned long, pfn) __field(void *, rmap_item) __field(void *, mm) __field(int, err) ), TP_fast_assign( __entry->pfn = pfn; __entry->rmap_item = rmap_item; __entry->mm = mm; __entry->err = err; ), TP_printk("ksm pfn %lu rmap_item %p mm %p error %d", __entry->pfn, __entry->rmap_item, __entry->mm, __entry->err) ); /** * ksm_merge_with_ksm_page - called after a page has been merged with a ksm page * * @ksm_page: address ksm page * @pfn: page frame number of ksm page * @rmap_item: address of rmap_item object * @mm: address of the mm object of the process * @err: success * * Allows to trace the merging of a page with a ksm page. */ TRACE_EVENT(ksm_merge_with_ksm_page, TP_PROTO(void *ksm_page, unsigned long pfn, void *rmap_item, void *mm, int err), TP_ARGS(ksm_page, pfn, rmap_item, mm, err), TP_STRUCT__entry( __field(void *, ksm_page) __field(unsigned long, pfn) __field(void *, rmap_item) __field(void *, mm) __field(int, err) ), TP_fast_assign( __entry->ksm_page = ksm_page; __entry->pfn = pfn; __entry->rmap_item = rmap_item; __entry->mm = mm; __entry->err = err; ), TP_printk("%spfn %lu rmap_item %p mm %p error %d", (__entry->ksm_page ? "ksm " : ""), __entry->pfn, __entry->rmap_item, __entry->mm, __entry->err) ); /** * ksm_remove_ksm_page - called after a ksm page has been removed * * @pfn: page frame number of ksm page * * Allows to trace the removing of stable ksm pages. */ TRACE_EVENT(ksm_remove_ksm_page, TP_PROTO(unsigned long pfn), TP_ARGS(pfn), TP_STRUCT__entry( __field(unsigned long, pfn) ), TP_fast_assign( __entry->pfn = pfn; ), TP_printk("pfn %lu", __entry->pfn) ); /** * ksm_remove_rmap_item - called after a rmap_item has been removed from the * stable tree * * @pfn: page frame number of ksm page * @rmap_item: address of rmap_item object * @mm: address of the process mm struct * * Allows to trace the removal of pages from the stable tree list. */ TRACE_EVENT(ksm_remove_rmap_item, TP_PROTO(unsigned long pfn, void *rmap_item, void *mm), TP_ARGS(pfn, rmap_item, mm), TP_STRUCT__entry( __field(unsigned long, pfn) __field(void *, rmap_item) __field(void *, mm) ), TP_fast_assign( __entry->pfn = pfn; __entry->rmap_item = rmap_item; __entry->mm = mm; ), TP_printk("pfn %lu rmap_item %p mm %p", __entry->pfn, __entry->rmap_item, __entry->mm) ); /** * ksm_advisor - called after the advisor has run * * @scan_time: scan time in seconds * @pages_to_scan: new pages_to_scan value * @cpu_percent: cpu usage in percent * * Allows to trace the ksm advisor. */ TRACE_EVENT(ksm_advisor, TP_PROTO(s64 scan_time, unsigned long pages_to_scan, unsigned int cpu_percent), TP_ARGS(scan_time, pages_to_scan, cpu_percent), TP_STRUCT__entry( __field(s64, scan_time) __field(unsigned long, pages_to_scan) __field(unsigned int, cpu_percent) ), TP_fast_assign( __entry->scan_time = scan_time; __entry->pages_to_scan = pages_to_scan; __entry->cpu_percent = cpu_percent; ), TP_printk("ksm scan time %lld pages_to_scan %lu cpu percent %u", __entry->scan_time, __entry->pages_to_scan, __entry->cpu_percent) ); #endif /* _TRACE_KSM_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 7 3 28 27 3 14 13 33 15 13 18 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 | /* * Copyright (c) 2016 Tom Herbert <tom@herbertland.com> * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef _TLS_INT_H #define _TLS_INT_H #include <asm/byteorder.h> #include <linux/types.h> #include <linux/skmsg.h> #include <net/tls.h> #include <net/tls_prot.h> #define TLS_PAGE_ORDER (min_t(unsigned int, PAGE_ALLOC_COSTLY_ORDER, \ TLS_MAX_PAYLOAD_SIZE >> PAGE_SHIFT)) #define __TLS_INC_STATS(net, field) \ __SNMP_INC_STATS((net)->mib.tls_statistics, field) #define TLS_INC_STATS(net, field) \ SNMP_INC_STATS((net)->mib.tls_statistics, field) #define TLS_DEC_STATS(net, field) \ SNMP_DEC_STATS((net)->mib.tls_statistics, field) struct tls_cipher_desc { unsigned int nonce; unsigned int iv; unsigned int key; unsigned int salt; unsigned int tag; unsigned int rec_seq; unsigned int iv_offset; unsigned int key_offset; unsigned int salt_offset; unsigned int rec_seq_offset; char *cipher_name; bool offloadable; size_t crypto_info; }; #define TLS_CIPHER_MIN TLS_CIPHER_AES_GCM_128 #define TLS_CIPHER_MAX TLS_CIPHER_ARIA_GCM_256 extern const struct tls_cipher_desc tls_cipher_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN]; static inline const struct tls_cipher_desc *get_cipher_desc(u16 cipher_type) { if (cipher_type < TLS_CIPHER_MIN || cipher_type > TLS_CIPHER_MAX) return NULL; return &tls_cipher_desc[cipher_type - TLS_CIPHER_MIN]; } static inline char *crypto_info_iv(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->iv_offset; } static inline char *crypto_info_key(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->key_offset; } static inline char *crypto_info_salt(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->salt_offset; } static inline char *crypto_info_rec_seq(struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc) { return (char *)crypto_info + cipher_desc->rec_seq_offset; } /* TLS records are maintained in 'struct tls_rec'. It stores the memory pages * allocated or mapped for each TLS record. After encryption, the records are * stores in a linked list. */ struct tls_rec { struct list_head list; int tx_ready; int tx_flags; struct sk_msg msg_plaintext; struct sk_msg msg_encrypted; /* AAD | msg_plaintext.sg.data | sg_tag */ struct scatterlist sg_aead_in[2]; /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */ struct scatterlist sg_aead_out[2]; char content_type; struct scatterlist sg_content_type; struct sock *sk; char aad_space[TLS_AAD_SPACE_SIZE]; u8 iv_data[TLS_MAX_IV_SIZE]; /* Must be last --ends in a flexible-array member. */ struct aead_request aead_req; }; int __net_init tls_proc_init(struct net *net); void __net_exit tls_proc_fini(struct net *net); struct tls_context *tls_ctx_create(struct sock *sk); void tls_ctx_free(struct sock *sk, struct tls_context *ctx); void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); void tls_err_abort(struct sock *sk, int err); void tls_strp_abort_strp(struct tls_strparser *strp, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc); int tls_set_sw_offload(struct sock *sk, int tx, struct tls_crypto_info *new_crypto_info); void tls_update_rx_zc_capable(struct tls_context *tls_ctx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_sw_splice_eof(struct socket *sock); void tls_sw_cancel_work_tx(struct tls_context *tls_ctx); void tls_sw_release_resources_tx(struct sock *sk); void tls_sw_free_ctx_tx(struct tls_context *tls_ctx); void tls_sw_free_resources_rx(struct sock *sk); void tls_sw_release_resources_rx(struct sock *sk); void tls_sw_free_ctx_rx(struct tls_context *tls_ctx); int tls_sw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len); bool tls_sw_sock_is_readable(struct sock *sk); ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); int tls_sw_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t read_actor); int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); void tls_device_splice_eof(struct socket *sock); int tls_tx_records(struct sock *sk, int flags); void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); void tls_device_write_space(struct sock *sk, struct tls_context *ctx); int tls_process_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type); int decrypt_skb(struct sock *sk, struct scatterlist *sgout); int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); int tls_strp_dev_init(void); void tls_strp_dev_exit(void); void tls_strp_done(struct tls_strparser *strp); void tls_strp_stop(struct tls_strparser *strp); int tls_strp_init(struct tls_strparser *strp, struct sock *sk); void tls_strp_data_ready(struct tls_strparser *strp); void tls_strp_check_rcv(struct tls_strparser *strp); void tls_strp_msg_done(struct tls_strparser *strp); int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); void tls_rx_msg_ready(struct tls_strparser *strp); bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); int tls_strp_msg_cow(struct tls_sw_context_rx *ctx); struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx); int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst); static inline struct tls_msg *tls_msg(struct sk_buff *skb) { struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb; return &scb->tls; } static inline struct sk_buff *tls_strp_msg(struct tls_sw_context_rx *ctx) { DEBUG_NET_WARN_ON_ONCE(!ctx->strp.msg_ready || !ctx->strp.anchor->len); return ctx->strp.anchor; } static inline bool tls_strp_msg_ready(struct tls_sw_context_rx *ctx) { return READ_ONCE(ctx->strp.msg_ready); } static inline bool tls_strp_msg_mixed_decrypted(struct tls_sw_context_rx *ctx) { return ctx->strp.mixed_decrypted; } #ifdef CONFIG_TLS_DEVICE int tls_device_init(void); void tls_device_cleanup(void); int tls_set_device_offload(struct sock *sk); void tls_device_free_resources_tx(struct sock *sk); int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx); void tls_device_offload_cleanup_rx(struct sock *sk); void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq); int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx); #else static inline int tls_device_init(void) { return 0; } static inline void tls_device_cleanup(void) {} static inline int tls_set_device_offload(struct sock *sk) { return -EOPNOTSUPP; } static inline void tls_device_free_resources_tx(struct sock *sk) {} static inline int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) { return -EOPNOTSUPP; } static inline void tls_device_offload_cleanup_rx(struct sock *sk) {} static inline void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) {} static inline int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) { return 0; } #endif int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags); int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, int flags); void tls_free_partial_record(struct sock *sk, struct tls_context *ctx); static inline bool tls_is_partially_sent_record(struct tls_context *ctx) { return !!ctx->partially_sent_record; } static inline bool tls_is_pending_open_record(struct tls_context *tls_ctx) { return tls_ctx->pending_open_record_frags; } static inline bool tls_bigint_increment(unsigned char *seq, int len) { int i; for (i = len - 1; i >= 0; i--) { ++seq[i]; if (seq[i] != 0) break; } return (i == -1); } static inline void tls_bigint_subtract(unsigned char *seq, int n) { u64 rcd_sn; __be64 *p; BUILD_BUG_ON(TLS_MAX_REC_SEQ_SIZE != 8); p = (__be64 *)seq; rcd_sn = be64_to_cpu(*p); *p = cpu_to_be64(rcd_sn - n); } static inline void tls_advance_record_sn(struct sock *sk, struct tls_prot_info *prot, struct cipher_context *ctx) { if (tls_bigint_increment(ctx->rec_seq, prot->rec_seq_size)) tls_err_abort(sk, -EBADMSG); if (prot->version != TLS_1_3_VERSION && prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) tls_bigint_increment(ctx->iv + prot->salt_size, prot->iv_size); } static inline void tls_xor_iv_with_seq(struct tls_prot_info *prot, char *iv, char *seq) { int i; if (prot->version == TLS_1_3_VERSION || prot->cipher_type == TLS_CIPHER_CHACHA20_POLY1305) { for (i = 0; i < 8; i++) iv[i + 4] ^= seq[i]; } } static inline void tls_fill_prepend(struct tls_context *ctx, char *buf, size_t plaintext_len, unsigned char record_type) { struct tls_prot_info *prot = &ctx->prot_info; size_t pkt_len, iv_size = prot->iv_size; pkt_len = plaintext_len + prot->tag_size; if (prot->version != TLS_1_3_VERSION && prot->cipher_type != TLS_CIPHER_CHACHA20_POLY1305) { pkt_len += iv_size; memcpy(buf + TLS_NONCE_OFFSET, ctx->tx.iv + prot->salt_size, iv_size); } /* we cover nonce explicit here as well, so buf should be of * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE */ buf[0] = prot->version == TLS_1_3_VERSION ? TLS_RECORD_TYPE_DATA : record_type; /* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */ buf[1] = TLS_1_2_VERSION_MINOR; buf[2] = TLS_1_2_VERSION_MAJOR; /* we can use IV for nonce explicit according to spec */ buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; } static inline void tls_make_aad(char *buf, size_t size, char *record_sequence, unsigned char record_type, struct tls_prot_info *prot) { if (prot->version != TLS_1_3_VERSION) { memcpy(buf, record_sequence, prot->rec_seq_size); buf += 8; } else { size += prot->tag_size; } buf[0] = prot->version == TLS_1_3_VERSION ? TLS_RECORD_TYPE_DATA : record_type; buf[1] = TLS_1_2_VERSION_MAJOR; buf[2] = TLS_1_2_VERSION_MINOR; buf[3] = size >> 8; buf[4] = size & 0xFF; } #endif |
| 12 158 62 467 239 10 361 277 417 359 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Cryptographic scatter and gather helpers. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2002 Adam J. Richter <adam@yggdrasil.com> * Copyright (c) 2004 Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) 2007 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_SCATTERWALK_H #define _CRYPTO_SCATTERWALK_H #include <linux/errno.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/scatterlist.h> #include <linux/types.h> struct scatter_walk { /* Must be the first member, see struct skcipher_walk. */ union { void *const addr; /* Private API field, do not touch. */ union crypto_no_such_thing *__addr; }; struct scatterlist *sg; unsigned int offset; }; struct skcipher_walk { union { /* Virtual address of the source. */ struct { struct { const void *const addr; } virt; } src; /* Private field for the API, do not use. */ struct scatter_walk in; }; union { /* Virtual address of the destination. */ struct { struct { void *const addr; } virt; } dst; /* Private field for the API, do not use. */ struct scatter_walk out; }; unsigned int nbytes; unsigned int total; u8 *page; u8 *buffer; u8 *oiv; void *iv; unsigned int ivsize; int flags; unsigned int blocksize; unsigned int stride; unsigned int alignmask; }; static inline void scatterwalk_crypto_chain(struct scatterlist *head, struct scatterlist *sg, int num) { if (sg) sg_chain(head, num, sg); else sg_mark_end(head); } static inline void scatterwalk_start(struct scatter_walk *walk, struct scatterlist *sg) { walk->sg = sg; walk->offset = sg->offset; } /* * This is equivalent to scatterwalk_start(walk, sg) followed by * scatterwalk_skip(walk, pos). */ static inline void scatterwalk_start_at_pos(struct scatter_walk *walk, struct scatterlist *sg, unsigned int pos) { while (pos > sg->length) { pos -= sg->length; sg = sg_next(sg); } walk->sg = sg; walk->offset = sg->offset + pos; } static inline unsigned int scatterwalk_clamp(struct scatter_walk *walk, unsigned int nbytes) { unsigned int len_this_sg; unsigned int limit; if (walk->offset >= walk->sg->offset + walk->sg->length) scatterwalk_start(walk, sg_next(walk->sg)); len_this_sg = walk->sg->offset + walk->sg->length - walk->offset; /* * HIGHMEM case: the page may have to be mapped into memory. To avoid * the complexity of having to map multiple pages at once per sg entry, * clamp the returned length to not cross a page boundary. * * !HIGHMEM case: no mapping is needed; all pages of the sg entry are * already mapped contiguously in the kernel's direct map. For improved * performance, allow the walker to return data segments that cross a * page boundary. Do still cap the length to PAGE_SIZE, since some * users rely on that to avoid disabling preemption for too long when * using SIMD. It's also needed for when skcipher_walk uses a bounce * page due to the data not being aligned to the algorithm's alignmask. */ if (IS_ENABLED(CONFIG_HIGHMEM)) limit = PAGE_SIZE - offset_in_page(walk->offset); else limit = PAGE_SIZE; return min3(nbytes, len_this_sg, limit); } /* * Create a scatterlist that represents the remaining data in a walk. Uses * chaining to reference the original scatterlist, so this uses at most two * entries in @sg_out regardless of the number of entries in the original list. * Assumes that sg_init_table() was already done. */ static inline void scatterwalk_get_sglist(struct scatter_walk *walk, struct scatterlist sg_out[2]) { if (walk->offset >= walk->sg->offset + walk->sg->length) scatterwalk_start(walk, sg_next(walk->sg)); sg_set_page(sg_out, sg_page(walk->sg), walk->sg->offset + walk->sg->length - walk->offset, walk->offset); scatterwalk_crypto_chain(sg_out, sg_next(walk->sg), 2); } static inline void scatterwalk_map(struct scatter_walk *walk) { struct page *base_page = sg_page(walk->sg); unsigned int offset = walk->offset; void *addr; if (IS_ENABLED(CONFIG_HIGHMEM)) { struct page *page; page = base_page + (offset >> PAGE_SHIFT); offset = offset_in_page(offset); addr = kmap_local_page(page) + offset; } else { /* * When !HIGHMEM we allow the walker to return segments that * span a page boundary; see scatterwalk_clamp(). To make it * clear that in this case we're working in the linear buffer of * the whole sg entry in the kernel's direct map rather than * within the mapped buffer of a single page, compute the * address as an offset from the page_address() of the first * page of the sg entry. Either way the result is the address * in the direct map, but this makes it clearer what is really * going on. */ addr = page_address(base_page) + offset; } walk->__addr = addr; } /** * scatterwalk_next() - Get the next data buffer in a scatterlist walk * @walk: the scatter_walk * @total: the total number of bytes remaining, > 0 * * A virtual address for the next segment of data from the scatterlist will * be placed into @walk->addr. The caller must call scatterwalk_done_src() * or scatterwalk_done_dst() when it is done using this virtual address. * * Returns: the next number of bytes available, <= @total */ static inline unsigned int scatterwalk_next(struct scatter_walk *walk, unsigned int total) { unsigned int nbytes = scatterwalk_clamp(walk, total); scatterwalk_map(walk); return nbytes; } static inline void scatterwalk_unmap(struct scatter_walk *walk) { if (IS_ENABLED(CONFIG_HIGHMEM)) kunmap_local(walk->__addr); } static inline void scatterwalk_advance(struct scatter_walk *walk, unsigned int nbytes) { walk->offset += nbytes; } /** * scatterwalk_done_src() - Finish one step of a walk of source scatterlist * @walk: the scatter_walk * @nbytes: the number of bytes processed this step, less than or equal to the * number of bytes that scatterwalk_next() returned. * * Use this if the mapped address was not written to, i.e. it is source data. */ static inline void scatterwalk_done_src(struct scatter_walk *walk, unsigned int nbytes) { scatterwalk_unmap(walk); scatterwalk_advance(walk, nbytes); } /** * scatterwalk_done_dst() - Finish one step of a walk of destination scatterlist * @walk: the scatter_walk * @nbytes: the number of bytes processed this step, less than or equal to the * number of bytes that scatterwalk_next() returned. * * Use this if the mapped address may have been written to, i.e. it is * destination data. */ static inline void scatterwalk_done_dst(struct scatter_walk *walk, unsigned int nbytes) { scatterwalk_unmap(walk); /* * Explicitly check ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE instead of just * relying on flush_dcache_page() being a no-op when not implemented, * since otherwise the BUG_ON in sg_page() does not get optimized out. * This also avoids having to consider whether the loop would get * reliably optimized out or not. */ if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) { struct page *base_page; unsigned int offset; int start, end, i; base_page = sg_page(walk->sg); offset = walk->offset; start = offset >> PAGE_SHIFT; end = start + (nbytes >> PAGE_SHIFT); end += (offset_in_page(offset) + offset_in_page(nbytes) + PAGE_SIZE - 1) >> PAGE_SHIFT; for (i = start; i < end; i++) flush_dcache_page(base_page + i); } scatterwalk_advance(walk, nbytes); } void scatterwalk_skip(struct scatter_walk *walk, unsigned int nbytes); void memcpy_from_scatterwalk(void *buf, struct scatter_walk *walk, unsigned int nbytes); void memcpy_to_scatterwalk(struct scatter_walk *walk, const void *buf, unsigned int nbytes); void memcpy_from_sglist(void *buf, struct scatterlist *sg, unsigned int start, unsigned int nbytes); void memcpy_to_sglist(struct scatterlist *sg, unsigned int start, const void *buf, unsigned int nbytes); void memcpy_sglist(struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes); /* In new code, please use memcpy_{from,to}_sglist() directly instead. */ static inline void scatterwalk_map_and_copy(void *buf, struct scatterlist *sg, unsigned int start, unsigned int nbytes, int out) { if (out) memcpy_to_sglist(sg, start, buf, nbytes); else memcpy_from_sglist(buf, sg, start, nbytes); } struct scatterlist *scatterwalk_ffwd(struct scatterlist dst[2], struct scatterlist *src, unsigned int len); int skcipher_walk_first(struct skcipher_walk *walk, bool atomic); int skcipher_walk_done(struct skcipher_walk *walk, int res); static inline void skcipher_walk_abort(struct skcipher_walk *walk) { skcipher_walk_done(walk, -ECANCELED); } #endif /* _CRYPTO_SCATTERWALK_H */ |
| 1184 877 307 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef __LICENSE_H #define __LICENSE_H static inline int license_is_gpl_compatible(const char *license) { return (strcmp(license, "GPL") == 0 || strcmp(license, "GPL v2") == 0 || strcmp(license, "GPL and additional rights") == 0 || strcmp(license, "Dual BSD/GPL") == 0 || strcmp(license, "Dual MIT/GPL") == 0 || strcmp(license, "Dual MPL/GPL") == 0); } #endif |
| 44 25 22 47 3 47 47 4 45 47 3 26 23 53 2 20 33 12 4 39 47 40 20 26 79 1 2 15 4 57 37 13 35 44 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 | // SPDX-License-Identifier: GPL-2.0-or-later /* * linux/mm/process_vm_access.c * * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp. */ #include <linux/compat.h> #include <linux/mm.h> #include <linux/uio.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/ptrace.h> #include <linux/slab.h> #include <linux/syscalls.h> /** * process_vm_rw_pages - read/write pages from task specified * @pages: array of pointers to pages we want to copy * @offset: offset in page to start copying from/to * @len: number of bytes to copy * @iter: where to copy to/from locally * @vm_write: 0 means copy from, 1 means copy to * Returns 0 on success, error code otherwise */ static int process_vm_rw_pages(struct page **pages, unsigned offset, size_t len, struct iov_iter *iter, int vm_write) { /* Do the copy for each page */ while (len && iov_iter_count(iter)) { struct page *page = *pages++; size_t copy = PAGE_SIZE - offset; size_t copied; if (copy > len) copy = len; if (vm_write) copied = copy_page_from_iter(page, offset, copy, iter); else copied = copy_page_to_iter(page, offset, copy, iter); len -= copied; if (copied < copy && iov_iter_count(iter)) return -EFAULT; offset = 0; } return 0; } /* Maximum number of pages kmalloc'd to hold struct page's during copy */ #define PVM_MAX_KMALLOC_PAGES 2 /* Maximum number of pages that can be stored at a time */ #define PVM_MAX_USER_PAGES (PVM_MAX_KMALLOC_PAGES * PAGE_SIZE / sizeof(struct page *)) /** * process_vm_rw_single_vec - read/write pages from task specified * @addr: start memory address of target process * @len: size of area to copy to/from * @iter: where to copy to/from locally * @process_pages: struct pages area that can store at least * nr_pages_to_copy struct page pointers * @mm: mm for task * @task: task to read/write from * @vm_write: 0 means copy from, 1 means copy to * Returns 0 on success or on failure error code */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, struct iov_iter *iter, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, int vm_write) { unsigned long pa = addr & PAGE_MASK; unsigned long start_offset = addr - pa; unsigned long nr_pages; ssize_t rc = 0; unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; if (vm_write) flags |= FOLL_WRITE; while (!rc && nr_pages && iov_iter_count(iter)) { int pinned_pages = min_t(unsigned long, nr_pages, PVM_MAX_USER_PAGES); int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must * access remotely because task/mm might not * current/current->mm */ mmap_read_lock(mm); pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages, flags, process_pages, &locked); if (locked) mmap_read_unlock(mm); if (pinned_pages <= 0) return -EFAULT; bytes = pinned_pages * PAGE_SIZE - start_offset; if (bytes > len) bytes = len; rc = process_vm_rw_pages(process_pages, start_offset, bytes, iter, vm_write); len -= bytes; start_offset = 0; nr_pages -= pinned_pages; pa += pinned_pages * PAGE_SIZE; /* If vm_write is set, the pages need to be made dirty: */ unpin_user_pages_dirty_lock(process_pages, pinned_pages, vm_write); } return rc; } /* Maximum number of entries for process pages array which lives on stack */ #define PVM_MAX_PP_ARRAY_COUNT 16 /** * process_vm_rw_core - core of reading/writing pages from task specified * @pid: PID of process to read/write from/to * @iter: where to copy to/from locally * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process * * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. */ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) { struct task_struct *task; struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; struct page **process_pages = pp_stack; struct mm_struct *mm; unsigned long i; ssize_t rc = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; ssize_t iov_len; size_t total_len = iov_iter_count(iter); /* * Work out how many pages of struct pages we're going to need * when eventually calling get_user_pages */ for (i = 0; i < riovcnt; i++) { iov_len = rvec[i].iov_len; if (iov_len > 0) { nr_pages_iov = ((unsigned long)rvec[i].iov_base + iov_len - 1) / PAGE_SIZE - (unsigned long)rvec[i].iov_base / PAGE_SIZE + 1; nr_pages = max(nr_pages, nr_pages_iov); } } if (nr_pages == 0) return 0; if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { /* For reliability don't try to kmalloc more than 2 pages worth */ process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES * PAGE_SIZE, sizeof(struct page *)*nr_pages), GFP_KERNEL); if (!process_pages) return -ENOMEM; } /* Get process information */ task = find_get_task_by_vpid(pid); if (!task) { rc = -ESRCH; goto free_proc_pages; } mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); if (IS_ERR(mm)) { rc = PTR_ERR(mm); /* * Explicitly map EACCES to EPERM as EPERM is a more * appropriate error code for process_vw_readv/writev */ if (rc == -EACCES) rc = -EPERM; goto put_task_struct; } for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, iter, process_pages, mm, task, vm_write); /* copied = space before - space after */ total_len -= iov_iter_count(iter); /* If we have managed to copy any data at all then we return the number of bytes copied. Otherwise we return the error code */ if (total_len) rc = total_len; mmput(mm); put_task_struct: put_task_struct(task); free_proc_pages: if (process_pages != pp_stack) kfree(process_pages); return rc; } /** * process_vm_rw - check iovecs before calling core routine * @pid: PID of process to read/write from/to * @lvec: iovec array specifying where to copy to/from locally * @liovcnt: size of lvec array * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused * @vm_write: 0 if reading from other process, 1 if writing to other process * * Returns the number of bytes read/written or error code. May * return less bytes than expected if an error occurs during the copying * process. */ static ssize_t process_vm_rw(pid_t pid, const struct iovec __user *lvec, unsigned long liovcnt, const struct iovec __user *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) { struct iovec iovstack_l[UIO_FASTIOV]; struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r; struct iov_iter iter; ssize_t rc; int dir = vm_write ? ITER_SOURCE : ITER_DEST; if (flags != 0) return -EINVAL; /* Check iovecs */ rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); if (rc < 0) return rc; if (!iov_iter_count(&iter)) goto free_iov_l; iov_r = iovec_from_user(rvec, riovcnt, UIO_FASTIOV, iovstack_r, in_compat_syscall()); if (IS_ERR(iov_r)) { rc = PTR_ERR(iov_r); goto free_iov_l; } rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); if (iov_r != iovstack_r) kfree(iov_r); free_iov_l: kfree(iov_l); return rc; } SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, unsigned long, liovcnt, const struct iovec __user *, rvec, unsigned long, riovcnt, unsigned long, flags) { return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, const struct iovec __user *, lvec, unsigned long, liovcnt, const struct iovec __user *, rvec, unsigned long, riovcnt, unsigned long, flags) { return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); } |
| 28 28 28 28 4 3 10 5 6 11 11 6 8 11 28 23 8 4 3 4 8 3 5 28 23 14 23 28 4 3 1 3 13 13 15 15 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 | // SPDX-License-Identifier: GPL-2.0-or-later /* * The AEGIS-128 Authenticated-Encryption Algorithm * * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com> * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. */ #include <crypto/algapi.h> #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/jump_label.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <asm/simd.h> #include "aegis.h" #define AEGIS128_NONCE_SIZE 16 #define AEGIS128_STATE_BLOCKS 5 #define AEGIS128_KEY_SIZE 16 #define AEGIS128_MIN_AUTH_SIZE 8 #define AEGIS128_MAX_AUTH_SIZE 16 struct aegis_state { union aegis_block blocks[AEGIS128_STATE_BLOCKS]; }; struct aegis_ctx { union aegis_block key; }; static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_simd); static const union aegis_block crypto_aegis_const[2] = { { .words64 = { cpu_to_le64(U64_C(0x0d08050302010100)), cpu_to_le64(U64_C(0x6279e99059372215)), } }, { .words64 = { cpu_to_le64(U64_C(0xf12fc26d55183ddb)), cpu_to_le64(U64_C(0xdd28b57342311120)), } }, }; static bool aegis128_do_simd(void) { #ifdef CONFIG_CRYPTO_AEGIS128_SIMD if (static_branch_likely(&have_simd)) return crypto_simd_usable(); #endif return false; } static void crypto_aegis128_update(struct aegis_state *state) { union aegis_block tmp; unsigned int i; tmp = state->blocks[AEGIS128_STATE_BLOCKS - 1]; for (i = AEGIS128_STATE_BLOCKS - 1; i > 0; i--) crypto_aegis_aesenc(&state->blocks[i], &state->blocks[i - 1], &state->blocks[i]); crypto_aegis_aesenc(&state->blocks[0], &tmp, &state->blocks[0]); } static void crypto_aegis128_update_a(struct aegis_state *state, const union aegis_block *msg, bool do_simd) { if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && do_simd) { crypto_aegis128_update_simd(state, msg); return; } crypto_aegis128_update(state); crypto_aegis_block_xor(&state->blocks[0], msg); } static void crypto_aegis128_update_u(struct aegis_state *state, const void *msg, bool do_simd) { if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && do_simd) { crypto_aegis128_update_simd(state, msg); return; } crypto_aegis128_update(state); crypto_xor(state->blocks[0].bytes, msg, AEGIS_BLOCK_SIZE); } static void crypto_aegis128_init(struct aegis_state *state, const union aegis_block *key, const u8 *iv) { union aegis_block key_iv; unsigned int i; key_iv = *key; crypto_xor(key_iv.bytes, iv, AEGIS_BLOCK_SIZE); state->blocks[0] = key_iv; state->blocks[1] = crypto_aegis_const[1]; state->blocks[2] = crypto_aegis_const[0]; state->blocks[3] = *key; state->blocks[4] = *key; crypto_aegis_block_xor(&state->blocks[3], &crypto_aegis_const[0]); crypto_aegis_block_xor(&state->blocks[4], &crypto_aegis_const[1]); for (i = 0; i < 5; i++) { crypto_aegis128_update_a(state, key, false); crypto_aegis128_update_a(state, &key_iv, false); } } static void crypto_aegis128_ad(struct aegis_state *state, const u8 *src, unsigned int size, bool do_simd) { if (AEGIS_ALIGNED(src)) { const union aegis_block *src_blk = (const union aegis_block *)src; while (size >= AEGIS_BLOCK_SIZE) { crypto_aegis128_update_a(state, src_blk, do_simd); size -= AEGIS_BLOCK_SIZE; src_blk++; } } else { while (size >= AEGIS_BLOCK_SIZE) { crypto_aegis128_update_u(state, src, do_simd); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; } } } static void crypto_aegis128_wipe_chunk(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { memzero_explicit(dst, size); } static void crypto_aegis128_encrypt_chunk(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { union aegis_block tmp; if (AEGIS_ALIGNED(src) && AEGIS_ALIGNED(dst)) { while (size >= AEGIS_BLOCK_SIZE) { union aegis_block *dst_blk = (union aegis_block *)dst; const union aegis_block *src_blk = (const union aegis_block *)src; tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis_block_xor(&tmp, src_blk); crypto_aegis128_update_a(state, src_blk, false); *dst_blk = tmp; size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } else { while (size >= AEGIS_BLOCK_SIZE) { tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_xor(tmp.bytes, src, AEGIS_BLOCK_SIZE); crypto_aegis128_update_u(state, src, false); memcpy(dst, tmp.bytes, AEGIS_BLOCK_SIZE); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } if (size > 0) { union aegis_block msg = {}; memcpy(msg.bytes, src, size); tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis128_update_a(state, &msg, false); crypto_aegis_block_xor(&msg, &tmp); memcpy(dst, msg.bytes, size); } } static void crypto_aegis128_decrypt_chunk(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size) { union aegis_block tmp; if (AEGIS_ALIGNED(src) && AEGIS_ALIGNED(dst)) { while (size >= AEGIS_BLOCK_SIZE) { union aegis_block *dst_blk = (union aegis_block *)dst; const union aegis_block *src_blk = (const union aegis_block *)src; tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis_block_xor(&tmp, src_blk); crypto_aegis128_update_a(state, &tmp, false); *dst_blk = tmp; size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } else { while (size >= AEGIS_BLOCK_SIZE) { tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_xor(tmp.bytes, src, AEGIS_BLOCK_SIZE); crypto_aegis128_update_a(state, &tmp, false); memcpy(dst, tmp.bytes, AEGIS_BLOCK_SIZE); size -= AEGIS_BLOCK_SIZE; src += AEGIS_BLOCK_SIZE; dst += AEGIS_BLOCK_SIZE; } } if (size > 0) { union aegis_block msg = {}; memcpy(msg.bytes, src, size); tmp = state->blocks[2]; crypto_aegis_block_and(&tmp, &state->blocks[3]); crypto_aegis_block_xor(&tmp, &state->blocks[4]); crypto_aegis_block_xor(&tmp, &state->blocks[1]); crypto_aegis_block_xor(&msg, &tmp); memset(msg.bytes + size, 0, AEGIS_BLOCK_SIZE - size); crypto_aegis128_update_a(state, &msg, false); memcpy(dst, msg.bytes, size); } } static void crypto_aegis128_process_ad(struct aegis_state *state, struct scatterlist *sg_src, unsigned int assoclen, bool do_simd) { struct scatter_walk walk; union aegis_block buf; unsigned int pos = 0; scatterwalk_start(&walk, sg_src); while (assoclen != 0) { unsigned int size = scatterwalk_next(&walk, assoclen); const u8 *src = walk.addr; unsigned int left = size; if (pos + size >= AEGIS_BLOCK_SIZE) { if (pos > 0) { unsigned int fill = AEGIS_BLOCK_SIZE - pos; memcpy(buf.bytes + pos, src, fill); crypto_aegis128_update_a(state, &buf, do_simd); pos = 0; left -= fill; src += fill; } crypto_aegis128_ad(state, src, left, do_simd); src += left & ~(AEGIS_BLOCK_SIZE - 1); left &= AEGIS_BLOCK_SIZE - 1; } memcpy(buf.bytes + pos, src, left); pos += left; assoclen -= size; scatterwalk_done_src(&walk, size); } if (pos > 0) { memset(buf.bytes + pos, 0, AEGIS_BLOCK_SIZE - pos); crypto_aegis128_update_a(state, &buf, do_simd); } } static __always_inline int crypto_aegis128_process_crypt(struct aegis_state *state, struct skcipher_walk *walk, void (*crypt)(struct aegis_state *state, u8 *dst, const u8 *src, unsigned int size)) { int err = 0; while (walk->nbytes) { unsigned int nbytes = walk->nbytes; if (nbytes < walk->total) nbytes = round_down(nbytes, walk->stride); crypt(state, walk->dst.virt.addr, walk->src.virt.addr, nbytes); err = skcipher_walk_done(walk, walk->nbytes - nbytes); } return err; } static void crypto_aegis128_final(struct aegis_state *state, union aegis_block *tag_xor, u64 assoclen, u64 cryptlen) { u64 assocbits = assoclen * 8; u64 cryptbits = cryptlen * 8; union aegis_block tmp; unsigned int i; tmp.words64[0] = cpu_to_le64(assocbits); tmp.words64[1] = cpu_to_le64(cryptbits); crypto_aegis_block_xor(&tmp, &state->blocks[3]); for (i = 0; i < 7; i++) crypto_aegis128_update_a(state, &tmp, false); for (i = 0; i < AEGIS128_STATE_BLOCKS; i++) crypto_aegis_block_xor(tag_xor, &state->blocks[i]); } static int crypto_aegis128_setkey(struct crypto_aead *aead, const u8 *key, unsigned int keylen) { struct aegis_ctx *ctx = crypto_aead_ctx(aead); if (keylen != AEGIS128_KEY_SIZE) return -EINVAL; memcpy(ctx->key.bytes, key, AEGIS128_KEY_SIZE); return 0; } static int crypto_aegis128_setauthsize(struct crypto_aead *tfm, unsigned int authsize) { if (authsize > AEGIS128_MAX_AUTH_SIZE) return -EINVAL; if (authsize < AEGIS128_MIN_AUTH_SIZE) return -EINVAL; return 0; } static int crypto_aegis128_encrypt_generic(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); struct aegis_ctx *ctx = crypto_aead_ctx(tfm); unsigned int cryptlen = req->cryptlen; struct skcipher_walk walk; struct aegis_state state; skcipher_walk_aead_encrypt(&walk, req, false); crypto_aegis128_init(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, false); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_encrypt_chunk); crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); return 0; } static int crypto_aegis128_decrypt_generic(struct aead_request *req) { static const u8 zeros[AEGIS128_MAX_AUTH_SIZE] = {}; struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen - authsize; struct aegis_ctx *ctx = crypto_aead_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_init(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, false); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_decrypt_chunk); crypto_aegis128_final(&state, &tag, req->assoclen, cryptlen); if (unlikely(crypto_memneq(tag.bytes, zeros, authsize))) { /* * From Chapter 4. 'Security Analysis' of the AEGIS spec [0] * * "3. If verification fails, the decrypted plaintext and the * wrong authentication tag should not be given as output." * * [0] https://competitions.cr.yp.to/round3/aegisv11.pdf */ skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_process_crypt(NULL, &walk, crypto_aegis128_wipe_chunk); memzero_explicit(&tag, sizeof(tag)); return -EBADMSG; } return 0; } static int crypto_aegis128_encrypt_simd(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag = {}; unsigned int authsize = crypto_aead_authsize(tfm); struct aegis_ctx *ctx = crypto_aead_ctx(tfm); unsigned int cryptlen = req->cryptlen; struct skcipher_walk walk; struct aegis_state state; if (!aegis128_do_simd()) return crypto_aegis128_encrypt_generic(req); skcipher_walk_aead_encrypt(&walk, req, false); crypto_aegis128_init_simd(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, true); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_encrypt_chunk_simd); crypto_aegis128_final_simd(&state, &tag, req->assoclen, cryptlen, 0); scatterwalk_map_and_copy(tag.bytes, req->dst, req->assoclen + cryptlen, authsize, 1); return 0; } static int crypto_aegis128_decrypt_simd(struct aead_request *req) { struct crypto_aead *tfm = crypto_aead_reqtfm(req); union aegis_block tag; unsigned int authsize = crypto_aead_authsize(tfm); unsigned int cryptlen = req->cryptlen - authsize; struct aegis_ctx *ctx = crypto_aead_ctx(tfm); struct skcipher_walk walk; struct aegis_state state; if (!aegis128_do_simd()) return crypto_aegis128_decrypt_generic(req); scatterwalk_map_and_copy(tag.bytes, req->src, req->assoclen + cryptlen, authsize, 0); skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_init_simd(&state, &ctx->key, req->iv); crypto_aegis128_process_ad(&state, req->src, req->assoclen, true); crypto_aegis128_process_crypt(&state, &walk, crypto_aegis128_decrypt_chunk_simd); if (unlikely(crypto_aegis128_final_simd(&state, &tag, req->assoclen, cryptlen, authsize))) { skcipher_walk_aead_decrypt(&walk, req, false); crypto_aegis128_process_crypt(NULL, &walk, crypto_aegis128_wipe_chunk); return -EBADMSG; } return 0; } static struct aead_alg crypto_aegis128_alg_generic = { .setkey = crypto_aegis128_setkey, .setauthsize = crypto_aegis128_setauthsize, .encrypt = crypto_aegis128_encrypt_generic, .decrypt = crypto_aegis128_decrypt_generic, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, .chunksize = AEGIS_BLOCK_SIZE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aegis_ctx), .base.cra_priority = 100, .base.cra_name = "aegis128", .base.cra_driver_name = "aegis128-generic", .base.cra_module = THIS_MODULE, }; static struct aead_alg crypto_aegis128_alg_simd = { .setkey = crypto_aegis128_setkey, .setauthsize = crypto_aegis128_setauthsize, .encrypt = crypto_aegis128_encrypt_simd, .decrypt = crypto_aegis128_decrypt_simd, .ivsize = AEGIS128_NONCE_SIZE, .maxauthsize = AEGIS128_MAX_AUTH_SIZE, .chunksize = AEGIS_BLOCK_SIZE, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct aegis_ctx), .base.cra_priority = 200, .base.cra_name = "aegis128", .base.cra_driver_name = "aegis128-simd", .base.cra_module = THIS_MODULE, }; static int __init crypto_aegis128_module_init(void) { int ret; ret = crypto_register_aead(&crypto_aegis128_alg_generic); if (ret) return ret; if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && crypto_aegis128_have_simd()) { ret = crypto_register_aead(&crypto_aegis128_alg_simd); if (ret) { crypto_unregister_aead(&crypto_aegis128_alg_generic); return ret; } static_branch_enable(&have_simd); } return 0; } static void __exit crypto_aegis128_module_exit(void) { if (IS_ENABLED(CONFIG_CRYPTO_AEGIS128_SIMD) && crypto_aegis128_have_simd()) crypto_unregister_aead(&crypto_aegis128_alg_simd); crypto_unregister_aead(&crypto_aegis128_alg_generic); } module_init(crypto_aegis128_module_init); module_exit(crypto_aegis128_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm"); MODULE_ALIAS_CRYPTO("aegis128"); MODULE_ALIAS_CRYPTO("aegis128-generic"); MODULE_ALIAS_CRYPTO("aegis128-simd"); |
| 7 360 1704 1081 776 264 783 257 550 1536 1694 1599 183 11 1690 6 352 2 1239 354 1 1 352 1 338 4 3 121 2 105 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 | // SPDX-License-Identifier: GPL-2.0-or-later /* * SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions * * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2014 Red Hat Inc. * Copyright 2025 Google LLC */ #include <crypto/hmac.h> #include <crypto/sha2.h> #include <linux/export.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/string.h> #include <linux/unaligned.h> #include <linux/wordpart.h> static const struct sha256_block_state sha224_iv = { .h = { SHA224_H0, SHA224_H1, SHA224_H2, SHA224_H3, SHA224_H4, SHA224_H5, SHA224_H6, SHA224_H7, }, }; static const struct sha256_ctx initial_sha256_ctx = { .ctx = { .state = { .h = { SHA256_H0, SHA256_H1, SHA256_H2, SHA256_H3, SHA256_H4, SHA256_H5, SHA256_H6, SHA256_H7, }, }, .bytecount = 0, }, }; #define sha256_iv (initial_sha256_ctx.ctx.state) static const u32 sha256_K[64] = { 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; #define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) #define Maj(x, y, z) (((x) & (y)) | ((z) & ((x) | (y)))) #define e0(x) (ror32((x), 2) ^ ror32((x), 13) ^ ror32((x), 22)) #define e1(x) (ror32((x), 6) ^ ror32((x), 11) ^ ror32((x), 25)) #define s0(x) (ror32((x), 7) ^ ror32((x), 18) ^ ((x) >> 3)) #define s1(x) (ror32((x), 17) ^ ror32((x), 19) ^ ((x) >> 10)) static inline void LOAD_OP(int I, u32 *W, const u8 *input) { W[I] = get_unaligned_be32((__u32 *)input + I); } static inline void BLEND_OP(int I, u32 *W) { W[I] = s1(W[I - 2]) + W[I - 7] + s0(W[I - 15]) + W[I - 16]; } #define SHA256_ROUND(i, a, b, c, d, e, f, g, h) \ do { \ u32 t1, t2; \ t1 = h + e1(e) + Ch(e, f, g) + sha256_K[i] + W[i]; \ t2 = e0(a) + Maj(a, b, c); \ d += t1; \ h = t1 + t2; \ } while (0) static void sha256_block_generic(struct sha256_block_state *state, const u8 *input, u32 W[64]) { u32 a, b, c, d, e, f, g, h; int i; /* load the input */ for (i = 0; i < 16; i += 8) { LOAD_OP(i + 0, W, input); LOAD_OP(i + 1, W, input); LOAD_OP(i + 2, W, input); LOAD_OP(i + 3, W, input); LOAD_OP(i + 4, W, input); LOAD_OP(i + 5, W, input); LOAD_OP(i + 6, W, input); LOAD_OP(i + 7, W, input); } /* now blend */ for (i = 16; i < 64; i += 8) { BLEND_OP(i + 0, W); BLEND_OP(i + 1, W); BLEND_OP(i + 2, W); BLEND_OP(i + 3, W); BLEND_OP(i + 4, W); BLEND_OP(i + 5, W); BLEND_OP(i + 6, W); BLEND_OP(i + 7, W); } /* load the state into our registers */ a = state->h[0]; b = state->h[1]; c = state->h[2]; d = state->h[3]; e = state->h[4]; f = state->h[5]; g = state->h[6]; h = state->h[7]; /* now iterate */ for (i = 0; i < 64; i += 8) { SHA256_ROUND(i + 0, a, b, c, d, e, f, g, h); SHA256_ROUND(i + 1, h, a, b, c, d, e, f, g); SHA256_ROUND(i + 2, g, h, a, b, c, d, e, f); SHA256_ROUND(i + 3, f, g, h, a, b, c, d, e); SHA256_ROUND(i + 4, e, f, g, h, a, b, c, d); SHA256_ROUND(i + 5, d, e, f, g, h, a, b, c); SHA256_ROUND(i + 6, c, d, e, f, g, h, a, b); SHA256_ROUND(i + 7, b, c, d, e, f, g, h, a); } state->h[0] += a; state->h[1] += b; state->h[2] += c; state->h[3] += d; state->h[4] += e; state->h[5] += f; state->h[6] += g; state->h[7] += h; } static void __maybe_unused sha256_blocks_generic(struct sha256_block_state *state, const u8 *data, size_t nblocks) { u32 W[64]; do { sha256_block_generic(state, data, W); data += SHA256_BLOCK_SIZE; } while (--nblocks); memzero_explicit(W, sizeof(W)); } #if defined(CONFIG_CRYPTO_LIB_SHA256_ARCH) && !defined(__DISABLE_EXPORTS) #include "sha256.h" /* $(SRCARCH)/sha256.h */ #else #define sha256_blocks sha256_blocks_generic #endif static void __sha256_init(struct __sha256_ctx *ctx, const struct sha256_block_state *iv, u64 initial_bytecount) { ctx->state = *iv; ctx->bytecount = initial_bytecount; } void sha224_init(struct sha224_ctx *ctx) { __sha256_init(&ctx->ctx, &sha224_iv, 0); } EXPORT_SYMBOL_GPL(sha224_init); void sha256_init(struct sha256_ctx *ctx) { __sha256_init(&ctx->ctx, &sha256_iv, 0); } EXPORT_SYMBOL_GPL(sha256_init); void __sha256_update(struct __sha256_ctx *ctx, const u8 *data, size_t len) { size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; ctx->bytecount += len; if (partial + len >= SHA256_BLOCK_SIZE) { size_t nblocks; if (partial) { size_t l = SHA256_BLOCK_SIZE - partial; memcpy(&ctx->buf[partial], data, l); data += l; len -= l; sha256_blocks(&ctx->state, ctx->buf, 1); } nblocks = len / SHA256_BLOCK_SIZE; len %= SHA256_BLOCK_SIZE; if (nblocks) { sha256_blocks(&ctx->state, data, nblocks); data += nblocks * SHA256_BLOCK_SIZE; } partial = 0; } if (len) memcpy(&ctx->buf[partial], data, len); } EXPORT_SYMBOL(__sha256_update); static void __sha256_final(struct __sha256_ctx *ctx, u8 *out, size_t digest_size) { u64 bitcount = ctx->bytecount << 3; size_t partial = ctx->bytecount % SHA256_BLOCK_SIZE; ctx->buf[partial++] = 0x80; if (partial > SHA256_BLOCK_SIZE - 8) { memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - partial); sha256_blocks(&ctx->state, ctx->buf, 1); partial = 0; } memset(&ctx->buf[partial], 0, SHA256_BLOCK_SIZE - 8 - partial); *(__be64 *)&ctx->buf[SHA256_BLOCK_SIZE - 8] = cpu_to_be64(bitcount); sha256_blocks(&ctx->state, ctx->buf, 1); for (size_t i = 0; i < digest_size; i += 4) put_unaligned_be32(ctx->state.h[i / 4], out + i); } void sha224_final(struct sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]) { __sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(sha224_final); void sha256_final(struct sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]) { __sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); memzero_explicit(ctx, sizeof(*ctx)); } EXPORT_SYMBOL(sha256_final); void sha224(const u8 *data, size_t len, u8 out[SHA224_DIGEST_SIZE]) { struct sha224_ctx ctx; sha224_init(&ctx); sha224_update(&ctx, data, len); sha224_final(&ctx, out); } EXPORT_SYMBOL(sha224); void sha256(const u8 *data, size_t len, u8 out[SHA256_DIGEST_SIZE]) { struct sha256_ctx ctx; sha256_init(&ctx); sha256_update(&ctx, data, len); sha256_final(&ctx, out); } EXPORT_SYMBOL(sha256); /* * Pre-boot environment (as indicated by __DISABLE_EXPORTS being defined) * doesn't need either HMAC support or interleaved hashing support */ #ifndef __DISABLE_EXPORTS #ifndef sha256_finup_2x_arch static bool sha256_finup_2x_arch(const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) { return false; } static bool sha256_finup_2x_is_optimized_arch(void) { return false; } #endif /* Sequential fallback implementation of sha256_finup_2x() */ static noinline_for_stack void sha256_finup_2x_sequential( const struct __sha256_ctx *ctx, const u8 *data1, const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) { struct __sha256_ctx mut_ctx; mut_ctx = *ctx; __sha256_update(&mut_ctx, data1, len); __sha256_final(&mut_ctx, out1, SHA256_DIGEST_SIZE); mut_ctx = *ctx; __sha256_update(&mut_ctx, data2, len); __sha256_final(&mut_ctx, out2, SHA256_DIGEST_SIZE); } void sha256_finup_2x(const struct sha256_ctx *ctx, const u8 *data1, const u8 *data2, size_t len, u8 out1[SHA256_DIGEST_SIZE], u8 out2[SHA256_DIGEST_SIZE]) { if (ctx == NULL) ctx = &initial_sha256_ctx; if (likely(sha256_finup_2x_arch(&ctx->ctx, data1, data2, len, out1, out2))) return; sha256_finup_2x_sequential(&ctx->ctx, data1, data2, len, out1, out2); } EXPORT_SYMBOL_GPL(sha256_finup_2x); bool sha256_finup_2x_is_optimized(void) { return sha256_finup_2x_is_optimized_arch(); } EXPORT_SYMBOL_GPL(sha256_finup_2x_is_optimized); static void __hmac_sha256_preparekey(struct sha256_block_state *istate, struct sha256_block_state *ostate, const u8 *raw_key, size_t raw_key_len, const struct sha256_block_state *iv) { union { u8 b[SHA256_BLOCK_SIZE]; unsigned long w[SHA256_BLOCK_SIZE / sizeof(unsigned long)]; } derived_key = { 0 }; if (unlikely(raw_key_len > SHA256_BLOCK_SIZE)) { if (iv == &sha224_iv) sha224(raw_key, raw_key_len, derived_key.b); else sha256(raw_key, raw_key_len, derived_key.b); } else { memcpy(derived_key.b, raw_key, raw_key_len); } for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) derived_key.w[i] ^= REPEAT_BYTE(HMAC_IPAD_VALUE); *istate = *iv; sha256_blocks(istate, derived_key.b, 1); for (size_t i = 0; i < ARRAY_SIZE(derived_key.w); i++) derived_key.w[i] ^= REPEAT_BYTE(HMAC_OPAD_VALUE ^ HMAC_IPAD_VALUE); *ostate = *iv; sha256_blocks(ostate, derived_key.b, 1); memzero_explicit(&derived_key, sizeof(derived_key)); } void hmac_sha224_preparekey(struct hmac_sha224_key *key, const u8 *raw_key, size_t raw_key_len) { __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, raw_key, raw_key_len, &sha224_iv); } EXPORT_SYMBOL_GPL(hmac_sha224_preparekey); void hmac_sha256_preparekey(struct hmac_sha256_key *key, const u8 *raw_key, size_t raw_key_len) { __hmac_sha256_preparekey(&key->key.istate, &key->key.ostate, raw_key, raw_key_len, &sha256_iv); } EXPORT_SYMBOL_GPL(hmac_sha256_preparekey); void __hmac_sha256_init(struct __hmac_sha256_ctx *ctx, const struct __hmac_sha256_key *key) { __sha256_init(&ctx->sha_ctx, &key->istate, SHA256_BLOCK_SIZE); ctx->ostate = key->ostate; } EXPORT_SYMBOL_GPL(__hmac_sha256_init); void hmac_sha224_init_usingrawkey(struct hmac_sha224_ctx *ctx, const u8 *raw_key, size_t raw_key_len) { __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, raw_key, raw_key_len, &sha224_iv); ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; } EXPORT_SYMBOL_GPL(hmac_sha224_init_usingrawkey); void hmac_sha256_init_usingrawkey(struct hmac_sha256_ctx *ctx, const u8 *raw_key, size_t raw_key_len) { __hmac_sha256_preparekey(&ctx->ctx.sha_ctx.state, &ctx->ctx.ostate, raw_key, raw_key_len, &sha256_iv); ctx->ctx.sha_ctx.bytecount = SHA256_BLOCK_SIZE; } EXPORT_SYMBOL_GPL(hmac_sha256_init_usingrawkey); static void __hmac_sha256_final(struct __hmac_sha256_ctx *ctx, u8 *out, size_t digest_size) { /* Generate the padded input for the outer hash in ctx->sha_ctx.buf. */ __sha256_final(&ctx->sha_ctx, ctx->sha_ctx.buf, digest_size); memset(&ctx->sha_ctx.buf[digest_size], 0, SHA256_BLOCK_SIZE - digest_size); ctx->sha_ctx.buf[digest_size] = 0x80; *(__be32 *)&ctx->sha_ctx.buf[SHA256_BLOCK_SIZE - 4] = cpu_to_be32(8 * (SHA256_BLOCK_SIZE + digest_size)); /* Compute the outer hash, which gives the HMAC value. */ sha256_blocks(&ctx->ostate, ctx->sha_ctx.buf, 1); for (size_t i = 0; i < digest_size; i += 4) put_unaligned_be32(ctx->ostate.h[i / 4], out + i); memzero_explicit(ctx, sizeof(*ctx)); } void hmac_sha224_final(struct hmac_sha224_ctx *ctx, u8 out[SHA224_DIGEST_SIZE]) { __hmac_sha256_final(&ctx->ctx, out, SHA224_DIGEST_SIZE); } EXPORT_SYMBOL_GPL(hmac_sha224_final); void hmac_sha256_final(struct hmac_sha256_ctx *ctx, u8 out[SHA256_DIGEST_SIZE]) { __hmac_sha256_final(&ctx->ctx, out, SHA256_DIGEST_SIZE); } EXPORT_SYMBOL_GPL(hmac_sha256_final); void hmac_sha224(const struct hmac_sha224_key *key, const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]) { struct hmac_sha224_ctx ctx; hmac_sha224_init(&ctx, key); hmac_sha224_update(&ctx, data, data_len); hmac_sha224_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha224); void hmac_sha256(const struct hmac_sha256_key *key, const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]) { struct hmac_sha256_ctx ctx; hmac_sha256_init(&ctx, key); hmac_sha256_update(&ctx, data, data_len); hmac_sha256_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha256); void hmac_sha224_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, u8 out[SHA224_DIGEST_SIZE]) { struct hmac_sha224_ctx ctx; hmac_sha224_init_usingrawkey(&ctx, raw_key, raw_key_len); hmac_sha224_update(&ctx, data, data_len); hmac_sha224_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha224_usingrawkey); void hmac_sha256_usingrawkey(const u8 *raw_key, size_t raw_key_len, const u8 *data, size_t data_len, u8 out[SHA256_DIGEST_SIZE]) { struct hmac_sha256_ctx ctx; hmac_sha256_init_usingrawkey(&ctx, raw_key, raw_key_len); hmac_sha256_update(&ctx, data, data_len); hmac_sha256_final(&ctx, out); } EXPORT_SYMBOL_GPL(hmac_sha256_usingrawkey); #endif /* !__DISABLE_EXPORTS */ #ifdef sha256_mod_init_arch static int __init sha256_mod_init(void) { sha256_mod_init_arch(); return 0; } subsys_initcall(sha256_mod_init); static void __exit sha256_mod_exit(void) { } module_exit(sha256_mod_exit); #endif MODULE_DESCRIPTION("SHA-224, SHA-256, HMAC-SHA224, and HMAC-SHA256 library functions"); MODULE_LICENSE("GPL"); |
| 32300 4001 19970 1175 32233 19960 32 566 280 266 497 1030 19 51 1147 60 263 424 150 2013 3979 6019 4104 7 1974 5390 282 62 231 199 282 1316 1312 7 7 47 2458 3970 7835 5670 10968 7 364 889 9 103 1644 16 551 262 103 26 551 103 103 273 37 583 171 3743 30960 168 136 186 6207 194 178 136 170 178 171 2392 2394 65 298 5516 5529 5413 742 744 744 20 20 20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Macros for manipulating and testing page->flags */ #ifndef PAGE_FLAGS_H #define PAGE_FLAGS_H #include <linux/types.h> #include <linux/bug.h> #include <linux/mmdebug.h> #ifndef __GENERATING_BOUNDS_H #include <linux/mm_types.h> #include <generated/bounds.h> #endif /* !__GENERATING_BOUNDS_H */ /* * Various page->flags bits: * * PG_reserved is set for special pages. The "struct page" of such a page * should in general not be touched (e.g. set dirty) except by its owner. * Pages marked as PG_reserved include: * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, * initrd, HW tables) * - Pages reserved or allocated early during boot (before the page allocator * was initialized). This includes (depending on the architecture) the * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much * much more. Once (if ever) freed, PG_reserved is cleared and they will * be given to the page allocator. * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. PMEM, DAX, HMM) * Some PG_reserved pages will be excluded from the hibernation image. * PG_reserved does in general not hinder anybody from dumping or swapping * and is no longer required for remap_pfn_range(). ioremap might require it. * Consequently, PG_reserved for a page mapped into user space can indicate * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by * private allocations for its own usage. * * During initiation of disk I/O, PG_locked is set. This bit is set before I/O * and cleared when writeback _starts_ or when read _completes_. PG_writeback * is set before writeback starts and cleared when it finishes. * * PG_locked also pins a page in pagecache, and blocks truncation of the file * while it is held. * * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * * PG_swapbacked is set when a page uses swap as a backing storage. This are * usually PageAnon or shmem pages but please note that even anonymous pages * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! */ /* * Don't use the pageflags directly. Use the PageFoo macros. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which * extends from the high bits downwards. * * | FIELD | ... | FLAGS | * N-1 ^ 0 * (NR_PAGEFLAGS) * * The fields area is reserved for fields mapping zone, node (for NUMA) and * SPARSEMEM section (for variants of SPARSEMEM that require section ids like * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP). */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_head, /* Must be in bit 6 */ PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, PG_owner_priv_1, /* Owner use. If pagecache, fs may use */ PG_owner_2, /* Owner use. If pagecache, fs may use */ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_2 PG_arch_2, #endif #ifdef CONFIG_ARCH_USES_PG_ARCH_3 PG_arch_3, #endif __NR_PAGEFLAGS, PG_readahead = PG_reclaim, /* Anonymous memory (and shmem) */ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ /* Some filesystems */ PG_checked = PG_owner_priv_1, /* * Depending on the way an anonymous folio can be mapped into a page * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped * THP), PG_anon_exclusive may be set only for the head page or for * tail pages of an anonymous folio. For now, we only expect it to be * set on tail pages for PTE-mapped THP. */ PG_anon_exclusive = PG_owner_2, /* * Set if all buffer heads in the folio are mapped. * Filesystems which do not use BHs can use it for their own purpose. */ PG_mappedtodisk = PG_owner_2, /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes * when those inodes are being locally cached. */ PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, /* Has a grant mapping of another (foreign) domain's page. */ PG_foreign = PG_owner_priv_1, /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, #ifdef CONFIG_MIGRATION /* movable_ops page that is isolated for migration */ PG_movable_ops_isolated = PG_reclaim, /* this is a movable_ops page (for selected typed pages only) */ PG_movable_ops = PG_uptodate, #endif /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, #ifdef CONFIG_MEMORY_HOTPLUG /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif /* * Flags only valid for compound pages. Stored in first tail page's * flags word. Cannot use the first 8 flags or any flag marked as * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* * Return the real head page struct iff the @page is a fake head page, otherwise * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; /* * Only addresses aligned with PAGE_SIZE of struct page may be fake head * struct page. The alignment check aims to avoid access the fields ( * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly) * cold cacheline in some cases. */ if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) && test_bit(PG_head, &page->flags.f)) { /* * We can safely access the field of the @page[1] with PG_head * because the @page is a compound page composed with at least * two contiguous pages. */ unsigned long head = READ_ONCE(page[1].compound_head); if (likely(head & 1)) return (const struct page *)(head - 1); } return page; } static __always_inline bool page_count_writable(const struct page *page, int u) { if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return true; /* * The refcount check is ordered before the fake-head check to prevent * the following race: * CPU 1 (HVO) CPU 2 (speculative PFN walker) * * page_ref_freeze() * synchronize_rcu() * rcu_read_lock() * page_is_fake_head() is false * vmemmap_remap_pte() * XXX: struct page[] becomes r/o * * page_ref_unfreeze() * page_ref_count() is not zero * * atomic_add_unless(&page->_refcount) * XXX: try to modify r/o struct page[] * * The refcount check also prevents modification attempts to other (r/o) * tail pages that are not fake heads. */ if (atomic_read_acquire(&page->_refcount) == u) return false; return page_fixed_fake_head(page) == page; } #else static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } static inline bool page_count_writable(const struct page *page, int u) { return true; } #endif static __always_inline int page_is_fake_head(const struct page *page) { return page_fixed_fake_head(page) != page; } static __always_inline unsigned long _compound_head(const struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return head - 1; return (unsigned long)page_fixed_fake_head(page); } #define compound_head(page) ((typeof(page))_compound_head(page)) /** * page_folio - Converts from page to folio. * @p: The page. * * Every page is part of a folio. This function cannot be called on a * NULL pointer. * * Context: No reference, nor lock is required on @page. If the caller * does not hold a reference, this call may race with a folio split, so * it should re-check the folio still contains this page after gaining * a reference on the folio. * Return: The folio which contains this page. */ #define page_folio(p) (_Generic((p), \ const struct page *: (const struct folio *)_compound_head(p), \ struct page *: (struct folio *)_compound_head(p))) /** * folio_page - Return a page from a folio. * @folio: The folio. * @n: The page number to return. * * @n is relative to the start of the folio. This function does not * check that the page number lies within @folio; the caller is presumed * to have a reference to the page. */ #define folio_page(folio, n) (&(folio)->page + (n)) static __always_inline int PageTail(const struct page *page) { return READ_ONCE(page->compound_head) & 1 || page_is_fake_head(page); } static __always_inline int PageCompound(const struct page *page) { return test_bit(PG_head, &page->flags.f) || READ_ONCE(page->compound_head) & 1; } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { return READ_ONCE(page->flags.f) == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM void page_init_poison(struct page *page, size_t size); #else static inline void page_init_poison(struct page *page, size_t size) { } #endif static const unsigned long *const_folio_flags(const struct folio *folio, unsigned n) { const struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } static unsigned long *folio_flags(struct folio *folio, unsigned n) { struct page *page = &folio->page; VM_BUG_ON_PGFLAGS(page->compound_head & 1, page); VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags.f), page); return &page[n].flags.f; } /* * Page flags policies wrt compound pages * * PF_POISONED_CHECK * check if this struct page poisoned/uninitialized * * PF_ANY: * the page flag is relevant for small, head and tail pages. * * PF_HEAD: * for compound page all operations related to the page flag applied to * head page. * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. * * PF_SECOND: * the page flag is stored in the first tail page. */ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) #define PF_SECOND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) /* Which page is the flag stored in */ #define FOLIO_PF_ANY 0 #define FOLIO_PF_HEAD 0 #define FOLIO_PF_NO_TAIL 0 #define FOLIO_PF_NO_COMPOUND 0 #define FOLIO_PF_SECOND 1 #define FOLIO_HEAD_PAGE 0 #define FOLIO_SECOND_PAGE 1 /* * Macros to create function definitions for page flags */ #define FOLIO_TEST_FLAG(name, page) \ static __always_inline bool folio_test_##name(const struct folio *folio) \ { return test_bit(PG_##name, const_folio_flags(folio, page)); } #define FOLIO_SET_FLAG(name, page) \ static __always_inline void folio_set_##name(struct folio *folio) \ { set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void folio_clear_##name(struct folio *folio) \ { clear_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_SET_FLAG(name, page) \ static __always_inline void __folio_set_##name(struct folio *folio) \ { __set_bit(PG_##name, folio_flags(folio, page)); } #define __FOLIO_CLEAR_FLAG(name, page) \ static __always_inline void __folio_clear_##name(struct folio *folio) \ { __clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_SET_FLAG(name, page) \ static __always_inline bool folio_test_set_##name(struct folio *folio) \ { return test_and_set_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_TEST_CLEAR_FLAG(name, page) \ static __always_inline bool folio_test_clear_##name(struct folio *folio) \ { return test_and_clear_bit(PG_##name, folio_flags(folio, page)); } #define FOLIO_FLAG(name, page) \ FOLIO_TEST_FLAG(name, page) \ FOLIO_SET_FLAG(name, page) \ FOLIO_CLEAR_FLAG(name, page) #define TESTPAGEFLAG(uname, lname, policy) \ FOLIO_TEST_FLAG(lname, FOLIO_##policy) \ static __always_inline int Page##uname(const struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags.f); } #define SETPAGEFLAG(uname, lname, policy) \ FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define CLEARPAGEFLAG(uname, lname, policy) \ FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __SETPAGEFLAG(uname, lname, policy) \ __FOLIO_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define __CLEARPAGEFLAG(uname, lname, policy) \ __FOLIO_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTSETFLAG(uname, lname, policy) \ FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags.f); } #define TESTCLEARFLAG(uname, lname, policy) \ FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags.f); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ CLEARPAGEFLAG(uname, lname, policy) #define __PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ __SETPAGEFLAG(uname, lname, policy) \ __CLEARPAGEFLAG(uname, lname, policy) #define TESTSCFLAG(uname, lname, policy) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) #define FOLIO_TEST_FLAG_FALSE(name) \ static inline bool folio_test_##name(const struct folio *folio) \ { return false; } #define FOLIO_SET_FLAG_NOOP(name) \ static inline void folio_set_##name(struct folio *folio) { } #define FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void folio_clear_##name(struct folio *folio) { } #define __FOLIO_SET_FLAG_NOOP(name) \ static inline void __folio_set_##name(struct folio *folio) { } #define __FOLIO_CLEAR_FLAG_NOOP(name) \ static inline void __folio_clear_##name(struct folio *folio) { } #define FOLIO_TEST_SET_FLAG_FALSE(name) \ static inline bool folio_test_set_##name(struct folio *folio) \ { return false; } #define FOLIO_TEST_CLEAR_FLAG_FALSE(name) \ static inline bool folio_test_clear_##name(struct folio *folio) \ { return false; } #define FOLIO_FLAG_FALSE(name) \ FOLIO_TEST_FLAG_FALSE(name) \ FOLIO_SET_FLAG_NOOP(name) \ FOLIO_CLEAR_FLAG_NOOP(name) #define TESTPAGEFLAG_FALSE(uname, lname) \ FOLIO_TEST_FLAG_FALSE(lname) \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ FOLIO_SET_FLAG_NOOP(lname) \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname, lname) \ FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname, lname) \ __FOLIO_CLEAR_FLAG_NOOP(lname) \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname, lname) \ FOLIO_TEST_SET_FLAG_FALSE(lname) \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname, lname) \ FOLIO_TEST_CLEAR_FLAG_FALSE(lname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname) \ SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname) #define TESTSCFLAG_FALSE(uname, lname) \ TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE) FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) TESTCLEARFLAG(LRU, lru, PF_HEAD) FOLIO_FLAG(active, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ /* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause release_folio() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE) /* owner_2 can be set on tail pages for anon memory */ FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #define folio_test_highmem(__f) is_highmem_idx(folio_zonenum(__f)) #else PAGEFLAG_FALSE(HighMem, highmem) #endif #define PhysHighMem(__p) (PageHighMem(phys_to_page(__p))) /* Does kmap_local_folio() only allow access to one page of the folio? */ #ifdef CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP #define folio_test_partial_kmap(f) true #else #define folio_test_partial_kmap(f) folio_test_highmem(f) #endif #ifdef CONFIG_SWAP static __always_inline bool folio_test_swapcache(const struct folio *folio) { return folio_test_swapbacked(folio) && test_bit(PG_swapcache, const_folio_flags(folio, 0)); } FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE) FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(swapcache) #endif FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE) #ifdef CONFIG_MMU FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE) __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE) FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE) #else FOLIO_FLAG_FALSE(mlocked) __FOLIO_CLEAR_FLAG_NOOP(mlocked) FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked) FOLIO_TEST_SET_FLAG_FALSE(mlocked) #endif #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 #endif #ifdef CONFIG_PAGE_IDLE_FLAG #ifdef CONFIG_64BIT FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE) FOLIO_FLAG(idle, FOLIO_HEAD_PAGE) #endif /* See page_idle.h for !64BIT workaround */ #else /* !CONFIG_PAGE_IDLE_FLAG */ FOLIO_FLAG_FALSE(young) FOLIO_TEST_CLEAR_FLAG_FALSE(young) FOLIO_FLAG_FALSE(idle) #endif /* * PageReported() is used to track reported free pages within the Buddy * allocator. We can use the non-atomic version of the test and set * operations as both should be shielded with the zone lock to prevent * any possible races on the setting or clearing of the bit. */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) #ifdef CONFIG_MEMORY_HOTPLUG PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY) #else PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* * On an anonymous folio mapped into a user virtual memory area, * folio->mapping points to its anon_vma, not to a struct address_space; * with the FOLIO_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous folio in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the FOLIO_MAPPING_ANON_KSM bit may be set along with the FOLIO_MAPPING_ANON * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged folio. See ksm.h. * * Please note that, confusingly, "folio_mapping" refers to the inode * address_space which maps the folio from disk; whereas "folio_mapped" * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its * internal states, the folio->mapping does not exist as such, nor do * these flags below. So in order to avoid testing non-existent bits, * please make sure that folio_test_slab(folio) actually evaluates to * false before calling the following functions (e.g., folio_test_anon). * See mm/slab.h. */ #define FOLIO_MAPPING_ANON 0x1 #define FOLIO_MAPPING_ANON_KSM 0x2 #define FOLIO_MAPPING_KSM (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) #define FOLIO_MAPPING_FLAGS (FOLIO_MAPPING_ANON | FOLIO_MAPPING_ANON_KSM) static __always_inline bool folio_test_anon(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_ANON) != 0; } static __always_inline bool PageAnonNotKsm(const struct page *page) { unsigned long flags = (unsigned long)page_folio(page)->mapping; return (flags & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_ANON; } static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); } #ifdef CONFIG_KSM /* * A KSM page is one of those write-protected "shared pages" or "merged pages" * which KSM maps into multiple mms, wherever identical anonymous page content * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ static __always_inline bool folio_test_ksm(const struct folio *folio) { return ((unsigned long)folio->mapping & FOLIO_MAPPING_FLAGS) == FOLIO_MAPPING_KSM; } #else FOLIO_TEST_FLAG_FALSE(ksm) #endif u64 stable_page_flags(const struct page *page); /** * folio_xor_flags_has_waiters - Change some folio flags. * @folio: The folio. * @mask: Bits set in this word will be changed. * * This must only be used for flags which are changed with the folio * lock held. For example, it is unsafe to use for PG_dirty as that * can be set without the folio lock held. It can also only be used * on flags which are in the range 0-6 as some of the implementations * only affect those bits. * * Return: Whether there are tasks waiting on the folio. */ static inline bool folio_xor_flags_has_waiters(struct folio *folio, unsigned long mask) { return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0)); } /** * folio_test_uptodate - Is this folio up to date? * @folio: The folio. * * The uptodate flag is set on a folio when every byte in the folio is * at least as new as the corresponding bytes on storage. Anonymous * and CoW folios are always uptodate. If the folio is not uptodate, * some of the bytes in it may be; see the is_partially_uptodate() * address_space operation. */ static inline bool folio_test_uptodate(const struct folio *folio) { bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0)); /* * Must ensure that the data we read out of the folio is loaded * _after_ we've loaded folio->flags to check the uptodate bit. * We can skip the barrier if the folio is not uptodate, because * we wouldn't be reading anything from it. * * See folio_mark_uptodate() for the other side of the story. */ if (ret) smp_rmb(); return ret; } static inline bool PageUptodate(const struct page *page) { return folio_test_uptodate(page_folio(page)); } static __always_inline void __folio_mark_uptodate(struct folio *folio) { smp_wmb(); __set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void folio_mark_uptodate(struct folio *folio) { /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the folio * uptodate are actually visible before folio_test_uptodate becomes true. */ smp_wmb(); set_bit(PG_uptodate, folio_flags(folio, 0)); } static __always_inline void __SetPageUptodate(struct page *page) { __folio_mark_uptodate((struct folio *)page); } static __always_inline void SetPageUptodate(struct page *page) { folio_mark_uptodate((struct folio *)page); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) void __folio_start_writeback(struct folio *folio, bool keep_write); void set_page_writeback(struct page *page); #define folio_start_writeback(folio) \ __folio_start_writeback(folio, false) static __always_inline bool folio_test_head(const struct folio *folio) { return test_bit(PG_head, const_folio_flags(folio, FOLIO_PF_ANY)); } static __always_inline int PageHead(const struct page *page) { PF_POISONED_CHECK(page); return test_bit(PG_head, &page->flags.f) && !page_is_fake_head(page); } __SETPAGEFLAG(Head, head, PF_ANY) __CLEARPAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) /** * folio_test_large() - Does this folio contain more than one page? * @folio: The folio to test. * * Return: True if the folio is larger than one page. */ static inline bool folio_test_large(const struct folio *folio) { return folio_test_head(folio); } static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_head, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { BUG_ON(!PageHead(page)); ClearPageHead(page); } FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE) FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(large_rmappable) FOLIO_FLAG_FALSE(partially_mapped) #endif #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } #else TESTPAGEFLAG_FALSE(TransCompound, transcompound) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) /* * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the * compound page. * * This flag is set by hwpoison handler. Cleared by THP split or free page. */ FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* * For pages that do not use mapcount, page_type may be used. * The low 24 bits of pagetype may be used for your own purposes, as long * as you are careful to not affect the top 8 bits. The low bits of * pagetype will be overwritten when you clear the page_type from the page. */ enum pagetype { /* 0x00-0x7f are positive numbers, ie mapcount */ /* Reserve 0x80-0xef for mapcount overflow. */ PGTY_buddy = 0xf0, PGTY_offline = 0xf1, PGTY_table = 0xf2, PGTY_guard = 0xf3, PGTY_hugetlb = 0xf4, PGTY_slab = 0xf5, PGTY_zsmalloc = 0xf6, PGTY_unaccepted = 0xf7, PGTY_large_kmalloc = 0xf8, PGTY_mapcount_underflow = 0xff }; static inline bool page_type_has_type(int page_type) { return page_type < (PGTY_mapcount_underflow << 24); } /* This takes a mapcount which is one more than page->_mapcount */ static inline bool page_mapcount_is_type(unsigned int mapcount) { return page_type_has_type(mapcount - 1); } static inline bool page_has_type(const struct page *page) { return page_type_has_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ static __always_inline bool folio_test_##fname(const struct folio *folio) \ { \ return data_race(folio->page.page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __folio_set_##fname(struct folio *folio) \ { \ if (folio_test_##fname(folio)) \ return; \ VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX, \ folio); \ folio->page.page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __folio_clear_##fname(struct folio *folio) \ { \ if (folio->page.page_type == UINT_MAX) \ return; \ VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ folio->page.page_type = UINT_MAX; \ } #define PAGE_TYPE_OPS(uname, lname, fname) \ FOLIO_TYPE_OPS(lname, fname) \ static __always_inline int Page##uname(const struct page *page) \ { \ return data_race(page->page_type >> 24) == PGTY_##lname; \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ if (Page##uname(page)) \ return; \ VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page); \ page->page_type = (unsigned int)PGTY_##lname << 24; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ if (page->page_type == UINT_MAX) \ return; \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type = UINT_MAX; \ } /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ PAGE_TYPE_OPS(Buddy, buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the * containing section is online. (e.g. inflated in a balloon driver or * not onlined when onlining the section). * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * * When a memory block gets onlined, all pages are initialized with a * refcount of 1 and PageOffline(). generic_online_page() will * take care of clearing PageOffline(). * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() * pages (now with a reference count of zero) are treated like free (unmanaged) * pages, allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will * require not giving them to the buddy via generic_online_page(). * * Memory offlining code will not adjust the managed page count for any * PageOffline() pages, treating them like they were never exposed to the * buddy using generic_online_page(). * * There are drivers that mark a page PageOffline() and expect there won't be * any further access to page content. PFN walkers that read content of random * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ PAGE_TYPE_OPS(Offline, offline, offline) extern void page_offline_freeze(void); extern void page_offline_thaw(void); extern void page_offline_begin(void); extern void page_offline_end(void); /* * Marks pages in use as page tables. */ PAGE_TYPE_OPS(Table, table, pgtable) /* * Marks guardpages used with debug_pagealloc. */ PAGE_TYPE_OPS(Guard, guard, guard) FOLIO_TYPE_OPS(slab, slab) /** * PageSlab - Determine if the page belongs to the slab allocator * @page: The page to test. * * Context: Any context. * Return: True for slab pages, false for any other kind of page. */ static inline bool PageSlab(const struct page *page) { return folio_test_slab(page_folio(page)); } #ifdef CONFIG_HUGETLB_PAGE FOLIO_TYPE_OPS(hugetlb, hugetlb) #else FOLIO_TEST_FLAG_FALSE(hugetlb) #endif PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc) /* * Mark pages that has to be accepted before touched for the first time. * * Serialized with zone lock. */ PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted) FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc) /** * PageHuge - Determine if the page belongs to hugetlbfs * @page: The page to test. * * Context: Any context. * Return: True for hugetlbfs pages, false for anon pages or pages * belonging to other filesystems. */ static inline bool PageHuge(const struct page *page) { return folio_test_hugetlb(page_folio(page)); } /* * Check if a page is currently marked HWPoisoned. Note that this check is * best effort only and inherently racy: there is no way to synchronize with * failing hardware. */ static inline bool is_page_hwpoison(const struct page *page) { const struct folio *folio; if (PageHWPoison(page)) return true; folio = page_folio(page); return folio_test_hugetlb(folio) && PageHWPoison(&folio->page); } static inline bool folio_contain_hwpoisoned_page(struct folio *folio) { return folio_test_hwpoison(folio) || (folio_test_large(folio) && folio_test_has_hwpoisoned(folio)); } bool is_free_buddy_page(const struct page *page); #ifdef CONFIG_MIGRATION /* * This page is migratable through movable_ops (for selected typed pages * only). * * Page migration of such pages might fail, for example, if the page is * already isolated by somebody else, or if the page is about to get freed. * * While a subsystem might set selected typed pages that support page migration * as being movable through movable_ops, it must never clear this flag. * * This flag is only cleared when the page is freed back to the buddy. * * Only selected page types support this flag (see page_movable_ops()) and * the flag might be used in other context for other pages. Always use * page_has_movable_ops() instead. */ TESTPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); SETPAGEFLAG(MovableOps, movable_ops, PF_NO_TAIL); /* * A movable_ops page has this flag set while it is isolated for migration. * This flag primarily protects against concurrent migration attempts. * * Once migration ended (success or failure), the flag is cleared. The * flag is managed by the migration core. */ PAGEFLAG(MovableOpsIsolated, movable_ops_isolated, PF_NO_TAIL); #else /* !CONFIG_MIGRATION */ TESTPAGEFLAG_FALSE(MovableOps, movable_ops); SETPAGEFLAG_NOOP(MovableOps, movable_ops); PAGEFLAG_FALSE(MovableOpsIsolated, movable_ops_isolated); #endif /* CONFIG_MIGRATION */ /** * page_has_movable_ops - test for a movable_ops page * @page: The page to test. * * Test whether this is a movable_ops page. Such pages will stay that * way until freed. * * Returns true if this is a movable_ops page, otherwise false. */ static inline bool page_has_movable_ops(const struct page *page) { return PageMovableOps(page) && (PageOffline(page) || PageZsmalloc(page)); } static __always_inline int PageAnonExclusive(const struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); /* * HugeTLB stores this information on the head page; THP keeps it per * page */ if (PageHuge(page)) page = compound_head(page); return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void SetPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } static __always_inline void __ClearPageAnonExclusive(struct page *page) { VM_BUG_ON_PGFLAGS(!PageAnon(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags.f); } #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. If they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_active | \ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK) /* * Flags checked when a page is prepped for return by the page allocator. * Pages being prepped should not have these flags set. If they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) /* * Flags stored in the second page of a compound page. They may overlap * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ 1UL << PG_large_rmappable | 1UL << PG_partially_mapped) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** * folio_has_private - Determine if folio has private stuff * @folio: The folio to be checked * * Determine if a folio has private stuff, indicating that release routines * should be invoked upon it. */ static inline int folio_has_private(const struct folio *folio) { return !!(folio->flags.f & PAGE_FLAGS_PRIVATE); } #undef PF_ANY #undef PF_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */ |
| 33 6 30 30 30 8 24 30 25 19 24 7 25 25 25 1 25 30 33 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/highmem.h> #include <linux/gfp.h> #include <linux/cpu.h> #include <linux/export.h> #include "rds.h" struct rds_page_remainder { struct page *r_page; unsigned long r_offset; local_lock_t bh_lock; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; /** * rds_page_remainder_alloc - build up regions of a message. * * @scat: Scatter list for message * @bytes: the number of bytes needed. * @gfp: the waiting behaviour of the allocation * * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to * kmap the pages, etc. * * If @bytes is at least a full page then this just returns a page from * alloc_page(). * * If @bytes is a partial page then this stores the unused region of the * page in a per-cpu structure. Future partial-page allocations may be * satisfied from that cached region. This lets us waste less memory on * small allocations with minimal complexity. It works because the transmit * path passes read-only page regions down to devices. They hold a page * reference until they are done with the region. */ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp) { struct rds_page_remainder *rem; struct page *page; int ret; gfp |= __GFP_HIGHMEM; /* jump straight to allocation if we're trying for a huge page */ if (bytes >= PAGE_SIZE) { page = alloc_page(gfp); if (!page) { ret = -ENOMEM; } else { sg_set_page(scat, page, PAGE_SIZE, 0); ret = 0; } goto out; } local_bh_disable(); local_lock_nested_bh(&rds_page_remainders.bh_lock); rem = this_cpu_ptr(&rds_page_remainders); while (1) { /* avoid a tiny region getting stuck by tossing it */ if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { rds_stats_inc(s_page_remainder_miss); __free_page(rem->r_page); rem->r_page = NULL; } /* hand out a fragment from the cached page */ if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { sg_set_page(scat, rem->r_page, bytes, rem->r_offset); get_page(sg_page(scat)); if (rem->r_offset != 0) rds_stats_inc(s_page_remainder_hit); rem->r_offset += ALIGN(bytes, 8); if (rem->r_offset >= PAGE_SIZE) { __free_page(rem->r_page); rem->r_page = NULL; } ret = 0; break; } /* alloc if there is nothing for us to use */ local_unlock_nested_bh(&rds_page_remainders.bh_lock); local_bh_enable(); page = alloc_page(gfp); local_bh_disable(); local_lock_nested_bh(&rds_page_remainders.bh_lock); rem = this_cpu_ptr(&rds_page_remainders); if (!page) { ret = -ENOMEM; break; } /* did someone race to fill the remainder before us? */ if (rem->r_page) { __free_page(page); continue; } /* otherwise install our page and loop around to alloc */ rem->r_page = page; rem->r_offset = 0; } local_unlock_nested_bh(&rds_page_remainders.bh_lock); local_bh_enable(); out: rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, ret ? 0 : scat->length); return ret; } EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); void rds_page_exit(void) { unsigned int cpu; for_each_possible_cpu(cpu) { struct rds_page_remainder *rem; rem = &per_cpu(rds_page_remainders, cpu); rdsdebug("cpu %u\n", cpu); if (rem->r_page) __free_page(rem->r_page); rem->r_page = NULL; } } |
| 7 4 2 2 2 1 2 11 9 2 8 7 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 | // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for setting the MSS option in TCP packets. * * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> * Copyright (C) 2007 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/gfp.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <net/dst.h> #include <net/flow.h> #include <net/ipv6.h> #include <net/route.h> #include <net/tcp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter/xt_TCPMSS.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("Xtables: TCP Maximum Segment Size (MSS) adjustment"); MODULE_ALIAS("ipt_TCPMSS"); MODULE_ALIAS("ip6t_TCPMSS"); static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) { /* Beware zero-length options: make finite progress */ if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1; else return opt[offset+1]; } static u_int32_t tcpmss_reverse_mtu(struct net *net, const struct sk_buff *skb, unsigned int family) { struct flowi fl; struct rtable *rt = NULL; u_int32_t mtu = ~0U; if (family == PF_INET) { struct flowi4 *fl4 = &fl.u.ip4; memset(fl4, 0, sizeof(*fl4)); fl4->daddr = ip_hdr(skb)->saddr; } else { struct flowi6 *fl6 = &fl.u.ip6; memset(fl6, 0, sizeof(*fl6)); fl6->daddr = ipv6_hdr(skb)->saddr; } nf_route(net, (struct dst_entry **)&rt, &fl, false, family); if (rt != NULL) { mtu = dst_mtu(&rt->dst); dst_release(&rt->dst); } return mtu; } static int tcpmss_mangle_packet(struct sk_buff *skb, const struct xt_action_param *par, unsigned int family, unsigned int tcphoff, unsigned int minlen) { const struct xt_tcpmss_info *info = par->targinfo; struct tcphdr *tcph; int len, tcp_hdrlen; unsigned int i; __be16 oldval; u16 newmss; u8 *opt; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) return 0; if (skb_ensure_writable(skb, skb->len)) return -1; len = skb->len - tcphoff; if (len < (int)sizeof(struct tcphdr)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); tcp_hdrlen = tcph->doff * 4; if (len < tcp_hdrlen || tcp_hdrlen < sizeof(struct tcphdr)) return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { struct net *net = xt_net(par); unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu); if (min_mtu <= minlen) { net_err_ratelimited("unknown or invalid path-MTU (%u)\n", min_mtu); return -1; } newmss = min_mtu - minlen; } else newmss = info->mss; opt = (u_int8_t *)tcph; for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) { u_int16_t oldmss; oldmss = (opt[i+2] << 8) | opt[i+3]; /* Never increase MSS, even when setting it, as * doing so results in problems for hosts that rely * on MSS being set correctly. */ if (oldmss <= newmss) return 0; opt[i+2] = (newmss & 0xff00) >> 8; opt[i+3] = newmss & 0x00ff; inet_proto_csum_replace2(&tcph->check, skb, htons(oldmss), htons(newmss), false); return 0; } } /* There is data after the header so the option can't be added * without moving it, and doing so may make the SYN packet * itself too large. Accept the packet unmodified instead. */ if (len > tcp_hdrlen) return 0; /* tcph->doff has 4 bits, do not wrap it to 0 */ if (tcp_hdrlen >= 15 * 4) return 0; /* * MSS Option not found ?! add it.. */ if (skb_tailroom(skb) < TCPOLEN_MSS) { if (pskb_expand_head(skb, 0, TCPOLEN_MSS - skb_tailroom(skb), GFP_ATOMIC)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); } skb_put(skb, TCPOLEN_MSS); /* * IPv4: RFC 1122 states "If an MSS option is not received at * connection setup, TCP MUST assume a default send MSS of 536". * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum * length IPv6 header of 60, ergo the default MSS value is 1220 * Since no MSS was provided, we must use the default values */ if (xt_family(par) == NFPROTO_IPV4) newmss = min(newmss, (u16)536); else newmss = min(newmss, (u16)1220); opt = (u_int8_t *)tcph + sizeof(struct tcphdr); memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, htons(len), htons(len + TCPOLEN_MSS), true); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; opt[3] = newmss & 0x00ff; inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; inet_proto_csum_replace2(&tcph->check, skb, oldval, ((__be16 *)tcph)[6], false); return TCPOLEN_MSS; } static unsigned int tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par) { struct iphdr *iph = ip_hdr(skb); __be16 newlen; int ret; ret = tcpmss_mangle_packet(skb, par, PF_INET, iph->ihl * 4, sizeof(*iph) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { iph = ip_hdr(skb); newlen = htons(ntohs(iph->tot_len) + ret); csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; } return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; __be16 frag_off, oldlen, newlen; int tcphoff; int ret; nexthdr = ipv6h->nexthdr; tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); if (tcphoff < 0) return NF_DROP; ret = tcpmss_mangle_packet(skb, par, PF_INET6, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { ipv6h = ipv6_hdr(skb); oldlen = ipv6h->payload_len; newlen = htons(ntohs(oldlen) + ret); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen), (__force __wsum)newlen); ipv6h->payload_len = newlen; } return XT_CONTINUE; } #endif /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data; if (strcmp(m->u.kernel.match->name, "tcp") == 0 && tcpinfo->flg_cmp & TCPHDR_SYN && !(tcpinfo->invflags & XT_TCP_INV_FLAGS)) return true; return false; } static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ipt_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ip6t_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #endif static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV4, .name = "TCPMSS", .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .family = NFPROTO_IPV6, .name = "TCPMSS", .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #endif }; static int __init tcpmss_tg_init(void) { return xt_register_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } static void __exit tcpmss_tg_exit(void) { xt_unregister_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } module_init(tcpmss_tg_init); module_exit(tcpmss_tg_exit); |
| 802 324 2357 812 810 11 802 2 812 810 2 814 490 1964 915 1963 2193 2195 1678 1676 1229 490 1103 1231 2194 490 490 490 1638 1638 1196 322 954 727 1194 1639 1637 322 322 322 322 153 184 184 185 1 185 184 257 257 255 256 5 5 5 5 257 516 421 473 183 141 146 426 426 422 422 199 331 456 32 458 1625 1576 460 15 7 1 8 14 1402 1400 1399 1402 249 1194 42 42 42 42 15 38 4 8 42 42 42 42 130 132 42 5502 2556 7833 5278 248 126 3047 2445 180 6 679 679 9 121 124 2995 208 208 5327 9 210 210 209 210 209 22 191 191 20 210 209 210 209 203 208 204 204 22 22 5332 5333 5332 166 5220 5327 9 5277 157 157 5328 5330 2870 2870 2825 2392 2825 2826 103 103 2587 196 80 32 210 19 198 160 69 2476 2585 2531 2539 210 210 276 32 4 309 308 275 78 309 308 307 84 304 7 3 1 292 25 12 25 292 159 142 292 292 269 221 138 226 292 142 142 20 15 11 11 8 11 132 103 132 132 132 159 286 285 9 286 286 282 309 166 481 481 314 49 299 226 227 181 228 208 23 228 201 23 36 5 203 92 142 8 15 4 19 23 172 142 81 151 185 181 203 15 8 3 3 175 20 20 6 190 85 156 22 172 193 193 22 175 156 52 193 228 231 231 71 170 324 324 324 118 4 283 56 118 257 282 168 283 283 283 60 277 56 256 301 299 301 301 64 286 14 166 292 291 7 292 64 285 301 301 183 257 286 56 56 64 7 7 7 7 7 73 73 73 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 | /* * mm/rmap.c - physical to virtual reverse mappings * * Copyright 2001, Rik van Riel <riel@conectiva.com.br> * Released under the General Public License (GPL). * * Simple, low overhead reverse mapping scheme. * Please try to keep this thing as modular as possible. * * Provides methods for unmapping each kind of mapped page: * the anon methods track anonymous pages, and * the file methods track pages belonging to an inode. * * Original design by Rik van Riel <riel@conectiva.com.br> 2001 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 * Contributions by Hugh Dickins 2003, 2004 */ /* * Lock ordering in mm: * * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) * folio_lock * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * vma_start_write * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in block_dirty_folio) * i_pages lock (widely used) * lruvec->lru_lock (in folio_lruvec_lock_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock * * hugetlbfs PageHuge() take locks in this order: * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * vma_lock (hugetlb specific lock for pmd_sharing) * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) * folio_lock */ #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/huge_mm.h> #include <linux/backing-dev.h> #include <linux/page_idle.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> #include <linux/mm_inline.h> #include <linux/oom.h> #include <asm/tlbflush.h> #define CREATE_TRACE_POINTS #include <trace/events/migrate.h> #include "internal.h" static struct kmem_cache *anon_vma_cachep; static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { struct anon_vma *anon_vma; anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); if (anon_vma) { atomic_set(&anon_vma->refcount, 1); anon_vma->num_children = 0; anon_vma->num_active_vmas = 0; anon_vma->parent = anon_vma; /* * Initialise the anon_vma root to point to itself. If called * from fork, the root will be reset to the parents anon_vma. */ anon_vma->root = anon_vma; } return anon_vma; } static inline void anon_vma_free(struct anon_vma *anon_vma) { VM_BUG_ON(atomic_read(&anon_vma->refcount)); /* * Synchronize against folio_lock_anon_vma_read() such that * we can safely hold the lock without the anon_vma getting * freed. * * Relies on the full mb implied by the atomic_dec_and_test() from * put_anon_vma() against the acquire barrier implied by * down_read_trylock() from folio_lock_anon_vma_read(). This orders: * * folio_lock_anon_vma_read() VS put_anon_vma() * down_read_trylock() atomic_dec_and_test() * LOCK MB * atomic_read() rwsem_is_locked() * * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); } kmem_cache_free(anon_vma_cachep, anon_vma); } static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp) { return kmem_cache_alloc(anon_vma_chain_cachep, gfp); } static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) { kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); } static void anon_vma_chain_link(struct vm_area_struct *vma, struct anon_vma_chain *avc, struct anon_vma *anon_vma) { avc->vma = vma; avc->anon_vma = anon_vma; list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_interval_tree_insert(avc, &anon_vma->rb_root); } /** * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * * The common case will be that we already have one, which * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we * allocate a new one. * * Anon-vma allocations are very subtle, because we may have * optimistically looked up an anon_vma in folio_lock_anon_vma_read() * and that may actually touch the rwsem even in the newly * allocated vma (it depends on RCU to make sure that the * anon_vma isn't actually destroyed). * * As a result, we need to do proper anon_vma locking even * for the new allocation. At the same time, we do not want * to do any locking for the common case of already having * an anon_vma. */ int __anon_vma_prepare(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; mmap_assert_locked(mm); might_sleep(); avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_enomem; anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) goto out_enomem_free_avc; anon_vma->num_children++; /* self-parent link for new root */ allocated = anon_vma; } anon_vma_lock_write(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; anon_vma_chain_link(vma, avc, anon_vma); anon_vma->num_active_vmas++; allocated = NULL; avc = NULL; } spin_unlock(&mm->page_table_lock); anon_vma_unlock_write(anon_vma); if (unlikely(allocated)) put_anon_vma(allocated); if (unlikely(avc)) anon_vma_chain_free(avc); return 0; out_enomem_free_avc: anon_vma_chain_free(avc); out_enomem: return -ENOMEM; } /* * This is a useful helper function for locking the anon_vma root as * we traverse the vma->anon_vma_chain, looping over anon_vma's that * have the same vma. * * Such anon_vma's should have the same root, so you'd expect to see * just a single mutex_lock for the whole traversal. */ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma) { struct anon_vma *new_root = anon_vma->root; if (new_root != root) { if (WARN_ON_ONCE(root)) up_write(&root->rwsem); root = new_root; down_write(&root->rwsem); } return root; } static inline void unlock_anon_vma_root(struct anon_vma *root) { if (root) up_write(&root->rwsem); } /* * Attach the anon_vmas from src to dst. * Returns 0 on success, -ENOMEM on failure. * * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(), * copy_vma() and anon_vma_fork(). The first four want an exact copy of src, * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before * call, we can identify this case by checking (!dst->anon_vma && * src->anon_vma). * * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find * and reuse existing anon_vma which has no vmas and only one child anon_vma. * This prevents degradation of anon_vma hierarchy to endless linear chain in * case of constantly forking task. On the other hand, an anon_vma with more * than one child isn't reused even if there was no alive vma, thus rmap * walker has a good chance of avoiding scanning the whole hierarchy when it * searches where page is mapped. */ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; avc = anon_vma_chain_alloc(GFP_NOWAIT); if (unlikely(!avc)) { unlock_anon_vma_root(root); root = NULL; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto enomem_failure; } anon_vma = pavc->anon_vma; root = lock_anon_vma_root(root, anon_vma); anon_vma_chain_link(dst, avc, anon_vma); /* * Reuse existing anon_vma if it has no vma and only one * anon_vma child. * * Root anon_vma is never reused: * it has self-parent reference and at least one child. */ if (!dst->anon_vma && src->anon_vma && anon_vma->num_children < 2 && anon_vma->num_active_vmas == 0) dst->anon_vma = anon_vma; } if (dst->anon_vma) dst->anon_vma->num_active_vmas++; unlock_anon_vma_root(root); return 0; enomem_failure: /* * dst->anon_vma is dropped here otherwise its num_active_vmas can * be incorrectly decremented in unlink_anon_vmas(). * We can safely do this because callers of anon_vma_clone() don't care * about dst->anon_vma if anon_vma_clone() failed. */ dst->anon_vma = NULL; unlink_anon_vmas(dst); return -ENOMEM; } /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. * Returns 0 on success, non-zero on failure. */ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { struct anon_vma_chain *avc; struct anon_vma *anon_vma; int error; /* Don't bother if the parent process has no anon_vma here. */ if (!pvma->anon_vma) return 0; /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ vma->anon_vma = NULL; /* * First, attach the new VMA to the parent VMA's anon_vmas, * so rmap can find non-COWed pages in child processes. */ error = anon_vma_clone(vma, pvma); if (error) return error; /* An existing anon_vma has been reused, all done then. */ if (vma->anon_vma) return 0; /* Then add our own anon_vma. */ anon_vma = anon_vma_alloc(); if (!anon_vma) goto out_error; anon_vma->num_active_vmas++; avc = anon_vma_chain_alloc(GFP_KERNEL); if (!avc) goto out_error_free_anon_vma; /* * The root anon_vma's rwsem is the lock actually used when we * lock any of the anon_vmas in this anon_vma tree. */ anon_vma->root = pvma->anon_vma->root; anon_vma->parent = pvma->anon_vma; /* * With refcounts, an anon_vma can stay around longer than the * process it belongs to. The root anon_vma needs to be pinned until * this anon_vma is freed, because the lock lives in the root. */ get_anon_vma(anon_vma->root); /* Mark this anon_vma as the one where our new (COWed) pages go. */ vma->anon_vma = anon_vma; anon_vma_lock_write(anon_vma); anon_vma_chain_link(vma, avc, anon_vma); anon_vma->parent->num_children++; anon_vma_unlock_write(anon_vma); return 0; out_error_free_anon_vma: put_anon_vma(anon_vma); out_error: unlink_anon_vmas(vma); return -ENOMEM; } void unlink_anon_vmas(struct vm_area_struct *vma) { struct anon_vma_chain *avc, *next; struct anon_vma *root = NULL; /* * Unlink each anon_vma chained to the VMA. This list is ordered * from newest to oldest, ensuring the root anon_vma gets freed last. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; root = lock_anon_vma_root(root, anon_vma); anon_vma_interval_tree_remove(avc, &anon_vma->rb_root); /* * Leave empty anon_vmas on the list - we'll need * to free them outside the lock. */ if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) { anon_vma->parent->num_children--; continue; } list_del(&avc->same_vma); anon_vma_chain_free(avc); } if (vma->anon_vma) { vma->anon_vma->num_active_vmas--; /* * vma would still be needed after unlink, and anon_vma will be prepared * when handle fault. */ vma->anon_vma = NULL; } unlock_anon_vma_root(root); /* * Iterate the list once more, it now only contains empty and unlinked * anon_vmas, destroy them. Could not do before due to __put_anon_vma() * needing to write-acquire the anon_vma->root->rwsem. */ list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; VM_WARN_ON(anon_vma->num_children); VM_WARN_ON(anon_vma->num_active_vmas); put_anon_vma(anon_vma); list_del(&avc->same_vma); anon_vma_chain_free(avc); } } static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; init_rwsem(&anon_vma->rwsem); atomic_set(&anon_vma->refcount, 0); anon_vma->rb_root = RB_ROOT_CACHED; } void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); } /* * Getting a lock on a stable anon_vma from a page off the LRU is tricky! * * Since there is no serialization what so ever against folio_remove_rmap_*() * the best this function can do is return a refcount increased anon_vma * that might have been relevant to this page. * * The page might have been remapped to a different anon_vma or the anon_vma * returned may already be freed (and even reused). * * In case it was remapped to a different anon_vma, the new anon_vma will be a * child of the old anon_vma, and the anon_vma lifetime rules will therefore * ensure that any anon_vma obtained from the page will still be valid for as * long as we observe page_mapped() [ hence all those page_mapped() tests ]. * * All users of this function must be very careful when walking the anon_vma * chain and verify that the page in question is indeed mapped in it * [ something equivalent to page_mapped_in_vma() ]. * * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid * if there is a mapcount, we can dereference the anon_vma after observing * those. * * NOTE: the caller should normally hold folio lock when calling this. If * not, the caller needs to double check the anon_vma didn't change after * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it * concurrently without folio lock protection). See folio_lock_anon_vma_read() * which has already covered that, and comment above remap_pages(). */ struct anon_vma *folio_get_anon_vma(const struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } /* * If this folio is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } out: rcu_read_unlock(); return anon_vma; } /* * Similar to folio_get_anon_vma() except it locks the anon_vma. * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; struct anon_vma *root_anon_vma; unsigned long anon_mapping; retry: rcu_read_lock(); anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & FOLIO_MAPPING_FLAGS) != FOLIO_MAPPING_ANON) goto out; if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - FOLIO_MAPPING_ANON); root_anon_vma = READ_ONCE(anon_vma->root); if (down_read_trylock(&root_anon_vma->rwsem)) { /* * folio_move_anon_rmap() might have changed the anon_vma as we * might not hold the folio lock here. */ if (unlikely((unsigned long)READ_ONCE(folio->mapping) != anon_mapping)) { up_read(&root_anon_vma->rwsem); rcu_read_unlock(); goto retry; } /* * If the folio is still mapped, then this anon_vma is still * its anon_vma, and holding the mutex ensures that it will * not go away, see anon_vma_free(). */ if (!folio_mapped(folio)) { up_read(&root_anon_vma->rwsem); anon_vma = NULL; } goto out; } if (rwc && rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } /* trylock failed, we got to sleep */ if (!atomic_inc_not_zero(&anon_vma->refcount)) { anon_vma = NULL; goto out; } if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; } /* we pinned the anon_vma, its safe to sleep */ rcu_read_unlock(); anon_vma_lock_read(anon_vma); /* * folio_move_anon_rmap() might have changed the anon_vma as we might * not hold the folio lock here. */ if (unlikely((unsigned long)READ_ONCE(folio->mapping) != anon_mapping)) { anon_vma_unlock_read(anon_vma); put_anon_vma(anon_vma); anon_vma = NULL; goto retry; } if (atomic_dec_and_test(&anon_vma->refcount)) { /* * Oops, we held the last refcount, release the lock * and bail -- can't simply use put_anon_vma() because * we'll deadlock on the anon_vma_lock_write() recursion. */ anon_vma_unlock_read(anon_vma); __put_anon_vma(anon_vma); anon_vma = NULL; } return anon_vma; out: rcu_read_unlock(); return anon_vma; } #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is * important if a PTE was dirty when it was unmapped that it's flushed * before any IO is initiated on the page to prevent lost writes. Similarly, * it must be flushed before freeing to prevent data leakage. */ void try_to_unmap_flush(void) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; if (!tlb_ubc->flush_required) return; arch_tlbbatch_flush(&tlb_ubc->arch); tlb_ubc->flush_required = false; tlb_ubc->writable = false; } /* Flush iff there are potentially writable TLB entries that can race with IO */ void try_to_unmap_flush_dirty(void) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; if (tlb_ubc->writable) try_to_unmap_flush(); } /* * Bits 0-14 of mm->tlb_flush_batched record pending generations. * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. */ #define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 #define TLB_FLUSH_BATCH_PENDING_MASK \ ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) #define TLB_FLUSH_BATCH_PENDING_LARGE \ (TLB_FLUSH_BATCH_PENDING_MASK / 2) static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long start, unsigned long end) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; int batch; bool writable = pte_dirty(pteval); if (!pte_accessible(mm, pteval)) return; arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end); tlb_ubc->flush_required = true; /* * Ensure compiler does not re-order the setting of tlb_flush_batched * before the PTE is cleared. */ barrier(); batch = atomic_read(&mm->tlb_flush_batched); retry: if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { /* * Prevent `pending' from catching up with `flushed' because of * overflow. Reset `pending' and `flushed' to be 1 and 0 if * `pending' becomes large. */ if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1)) goto retry; } else { atomic_inc(&mm->tlb_flush_batched); } /* * If the PTE was dirty then it's best to assume it's writable. The * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush() * before the page is queued for IO. */ if (writable) tlb_ubc->writable = true; } /* * Returns true if the TLB flush should be deferred to the end of a batch of * unmap operations to reduce IPIs. */ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { if (!(flags & TTU_BATCH_FLUSH)) return false; return arch_tlbbatch_should_defer(mm); } /* * Reclaim unmaps pages under the PTL but do not flush the TLB prior to * releasing the PTL if TLB flushes are batched. It's possible for a parallel * operation such as mprotect or munmap to race between reclaim unmapping * the page and flushing the page. If this race occurs, it potentially allows * access to data via a stale TLB entry. Tracking all mm's that have TLB * batching in flight would be expensive during reclaim so instead track * whether TLB batching occurred in the past and if so then do a flush here * if required. This will cost one additional flush per reclaim cycle paid * by the first operation at risk such as mprotect and mumap. * * This must be called under the PTL so that an access to tlb_flush_batched * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise * via the PTL. */ void flush_tlb_batched_pending(struct mm_struct *mm) { int batch = atomic_read(&mm->tlb_flush_batched); int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { flush_tlb_mm(mm); /* * If the new TLB flushing is pending during flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. */ atomic_cmpxchg(&mm->tlb_flush_batched, batch, pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, unsigned long start, unsigned long end) { } static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { return false; } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ /** * page_address_in_vma - The virtual address of a page in this VMA. * @folio: The folio containing the page. * @page: The page within the folio. * @vma: The VMA we need to know the address in. * * Calculates the user virtual address of this page in the specified VMA. * It is the caller's responsibility to check the page is actually * within the VMA. There may not currently be a PTE pointing at this * page, but if a page fault occurs at this address, this is the page * which will be accessed. * * Context: Caller should hold a reference to the folio. Caller should * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the * VMA from being altered. * * Return: The virtual address corresponding to this page in the VMA. */ unsigned long page_address_in_vma(const struct folio *folio, const struct page *page, const struct vm_area_struct *vma) { if (folio_test_anon(folio)) { struct anon_vma *anon_vma = folio_anon_vma(folio); /* * Note: swapoff's unuse_vma() is more efficient with this * check, and needs it to match anon_vma when KSM is active. */ if (!vma->anon_vma || !anon_vma || vma->anon_vma->root != anon_vma->root) return -EFAULT; } else if (!vma->vm_file) { return -EFAULT; } else if (vma->vm_file->f_mapping != folio->mapping) { return -EFAULT; } /* KSM folios don't reach here because of the !anon_vma check */ return vma_address(vma, page_pgoff(folio, page), 1); } /* * Returns the actual pmd_t* where we expect 'address' to be mapped from, or * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* * represents. */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) goto out; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) goto out; pud = pud_offset(p4d, address); if (!pud_present(*pud)) goto out; pmd = pmd_offset(pud, address); out: return pmd; } struct folio_referenced_arg { int mapcount; int referenced; vm_flags_t vm_flags; struct mem_cgroup *memcg; }; /* * arg: folio_referenced_arg will be passed */ static bool folio_referenced_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int ptes = 0, referenced = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; if (vma->vm_flags & VM_LOCKED) { ptes++; pra->mapcount--; /* Only mlock fully mapped pages */ if (pvmw.pte && ptes != pvmw.nr_pages) continue; /* * All PTEs must be protected by page table lock in * order to mlock the page. * * If page table boundary has been cross, current ptl * only protect part of ptes. */ if (pvmw.flags & PVMW_PGTABLE_CROSSED) continue; /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; return false; /* To break the loop */ } /* * Skip the non-shared swapbacked folio mapped solely by * the exiting or OOM-reaped process. This avoids redundant * swap-out followed by an immediate unmap. */ if ((!atomic_read(&vma->vm_mm->mm_users) || check_stable_address_space(vma->vm_mm)) && folio_test_anon(folio) && folio_test_swapbacked(folio) && !folio_maybe_mapped_shared(folio)) { pra->referenced = -1; page_vma_mapped_walk_done(&pvmw); return false; } if (lru_gen_enabled() && pvmw.pte) { if (lru_gen_look_around(&pvmw)) referenced++; } else if (pvmw.pte) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) referenced++; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_flush_young_notify(vma, address, pvmw.pmd)) referenced++; } else { /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); } pra->mapcount--; } if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio)) referenced++; if (referenced) { pra->referenced++; pra->vm_flags |= vma->vm_flags & ~VM_LOCKED; } if (!pra->mapcount) return false; /* To break the loop */ return true; } static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg) { struct folio_referenced_arg *pra = arg; struct mem_cgroup *memcg = pra->memcg; /* * Ignore references from this mapping if it has no recency. If the * folio has been used in another mapping, we will catch it; if this * other mapping is already gone, the unmap path will have set the * referenced flag or activated the folio in zap_pte_range(). */ if (!vma_has_recency(vma)) return true; /* * If we are reclaiming on behalf of a cgroup, skip counting on behalf * of references from different cgroups. */ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) return true; return false; } /** * folio_referenced() - Test if the folio was referenced. * @folio: The folio to test. * @is_locked: Caller holds lock on the folio. * @memcg: target memory cgroup * @vm_flags: A combination of all the vma->vm_flags which referenced the folio. * * Quick test_and_clear_referenced for all mappings of a folio, * * Return: The number of mappings which referenced the folio. Return -1 if * the function bailed out due to rmap lock contention. */ int folio_referenced(struct folio *folio, int is_locked, struct mem_cgroup *memcg, vm_flags_t *vm_flags) { bool we_locked = false; struct folio_referenced_arg pra = { .mapcount = folio_mapcount(folio), .memcg = memcg, }; struct rmap_walk_control rwc = { .rmap_one = folio_referenced_one, .arg = (void *)&pra, .anon_lock = folio_lock_anon_vma_read, .try_lock = true, .invalid_vma = invalid_folio_referenced_vma, }; *vm_flags = 0; if (!pra.mapcount) return 0; if (!folio_raw_mapping(folio)) return 0; if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) { we_locked = folio_trylock(folio); if (!we_locked) return 1; } rmap_walk(folio, &rwc); *vm_flags = pra.vm_flags; if (we_locked) folio_unlock(folio); return rwc.contended ? -1 : pra.referenced; } static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) { int cleaned = 0; struct vm_area_struct *vma = pvmw->vma; struct mmu_notifier_range range; unsigned long address = pvmw->address; /* * We have to assume the worse case ie pmd for invalidation. Note that * the folio can not be freed from this function. */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma->vm_mm, address, vma_address_end(pvmw)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(pvmw)) { int ret = 0; address = pvmw->address; if (pvmw->pte) { pte_t *pte = pvmw->pte; pte_t entry = ptep_get(pte); /* * PFN swap PTEs, such as device-exclusive ones, that * actually map pages are clean and not writable from a * CPU perspective. The MMU notifier takes care of any * device aspects. */ if (!pte_present(entry)) continue; if (!pte_dirty(entry) && !pte_write(entry)) continue; flush_cache_page(vma, address, pte_pfn(entry)); entry = ptep_clear_flush(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw->pmd; pmd_t entry; if (!pmd_dirty(*pmd) && !pmd_write(*pmd)) continue; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); ret = 1; #else /* unexpected pmd-mapped folio? */ WARN_ON_ONCE(1); #endif } if (ret) cleaned++; } mmu_notifier_invalidate_range_end(&range); return cleaned; } static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC); int *cleaned = arg; *cleaned += page_vma_mkclean_one(&pvmw); return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) { if (vma->vm_flags & VM_SHARED) return false; return true; } int folio_mkclean(struct folio *folio) { int cleaned = 0; struct address_space *mapping; struct rmap_walk_control rwc = { .arg = (void *)&cleaned, .rmap_one = page_mkclean_one, .invalid_vma = invalid_mkclean_vma, }; BUG_ON(!folio_test_locked(folio)); if (!folio_mapped(folio)) return 0; mapping = folio_mapping(folio); if (!mapping) return 0; rmap_walk(folio, &rwc); return cleaned; } EXPORT_SYMBOL_GPL(folio_mkclean); struct wrprotect_file_state { int cleaned; pgoff_t pgoff; unsigned long pfn; unsigned long nr_pages; }; static bool mapping_wrprotect_range_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg; struct page_vma_mapped_walk pvmw = { .pfn = state->pfn, .nr_pages = state->nr_pages, .pgoff = state->pgoff, .vma = vma, .address = address, .flags = PVMW_SYNC, }; state->cleaned += page_vma_mkclean_one(&pvmw); return true; } static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, pgoff_t pgoff_start, unsigned long nr_pages, struct rmap_walk_control *rwc, bool locked); /** * mapping_wrprotect_range() - Write-protect all mappings in a specified range. * * @mapping: The mapping whose reverse mapping should be traversed. * @pgoff: The page offset at which @pfn is mapped within @mapping. * @pfn: The PFN of the page mapped in @mapping at @pgoff. * @nr_pages: The number of physically contiguous base pages spanned. * * Traverses the reverse mapping, finding all VMAs which contain a shared * mapping of the pages in the specified range in @mapping, and write-protects * them (that is, updates the page tables to mark the mappings read-only such * that a write protection fault arises when the mappings are written to). * * The @pfn value need not refer to a folio, but rather can reference a kernel * allocation which is mapped into userland. We therefore do not require that * the page maps to a folio with a valid mapping or index field, rather the * caller specifies these in @mapping and @pgoff. * * Return: the number of write-protected PTEs, or an error. */ int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff, unsigned long pfn, unsigned long nr_pages) { struct wrprotect_file_state state = { .cleaned = 0, .pgoff = pgoff, .pfn = pfn, .nr_pages = nr_pages, }; struct rmap_walk_control rwc = { .arg = (void *)&state, .rmap_one = mapping_wrprotect_range_one, .invalid_vma = invalid_mkclean_vma, }; if (!mapping) return 0; __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages, &rwc, /* locked = */false); return state.cleaned; } EXPORT_SYMBOL_GPL(mapping_wrprotect_range); /** * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff) * within the @vma of shared mappings. And since clean PTEs * should also be readonly, write protects them too. * @pfn: start pfn. * @nr_pages: number of physically contiguous pages srarting with @pfn. * @pgoff: page offset that the @pfn mapped with. * @vma: vma that @pfn mapped within. * * Returns the number of cleaned PTEs (including PMDs). */ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, struct vm_area_struct *vma) { struct page_vma_mapped_walk pvmw = { .pfn = pfn, .nr_pages = nr_pages, .pgoff = pgoff, .vma = vma, .flags = PVMW_SYNC, }; if (invalid_mkclean_vma(vma, NULL)) return 0; pvmw.address = vma_address(vma, pgoff, nr_pages); VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma); return page_vma_mkclean_one(&pvmw); } static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped) { int idx; if (nr) { idx = folio_test_anon(folio) ? NR_ANON_MAPPED : NR_FILE_MAPPED; __lruvec_stat_mod_folio(folio, idx, nr); } if (nr_pmdmapped) { if (folio_test_anon(folio)) { idx = NR_ANON_THPS; __lruvec_stat_mod_folio(folio, idx, nr_pmdmapped); } else { /* NR_*_PMDMAPPED are not maintained per-memcg */ idx = folio_test_swapbacked(folio) ? NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED; __mod_node_page_state(folio_pgdat(folio), idx, nr_pmdmapped); } } } static __always_inline void __folio_add_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; const int orig_nr_pages = nr_pages; int first = 0, nr = 0, nr_pmdmapped = 0; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_inc_and_test(&folio->_mapcount); break; } if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma); if (nr == orig_nr_pages) /* Was completely unmapped. */ nr = folio_large_nr_pages(folio); else nr = 0; break; } do { first += atomic_inc_and_test(&page->_mapcount); } while (page++, --nr_pages > 0); if (first && atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED) nr = first; folio_add_large_mapcount(folio, orig_nr_pages, vma); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: first = atomic_inc_and_test(&folio->_entire_mapcount); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { if (level == PGTABLE_LEVEL_PMD && first) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_inc_return_large_mapcount(folio, vma); if (nr == 1) /* Was completely unmapped. */ nr = folio_large_nr_pages(folio); else nr = 0; break; } if (first) { nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); /* * We only track PMD mappings of PMD-sized * folios separately. */ if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - (nr & FOLIO_PAGES_MAPPED); /* Raced ahead of a remove and another add? */ if (unlikely(nr < 0)) nr = 0; } else { /* Raced ahead of a remove of ENTIRELY_MAPPED */ nr = 0; } } folio_inc_large_mapcount(folio, vma); break; default: BUILD_BUG(); } __folio_mod_stat(folio, nr, nr_pmdmapped); } /** * folio_move_anon_rmap - move a folio to our anon_vma * @folio: The folio to move to our anon_vma * @vma: The vma the folio belongs to * * When a folio belongs exclusively to one process after a COW event, * that folio can be moved into the anon_vma that belongs to just that * process, so the rmap code will not search the parent or sibling processes. */ void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { void *anon_vma = vma->anon_vma; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); anon_vma += FOLIO_MAPPING_ANON; /* * Ensure that anon_vma and the FOLIO_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(folio->mapping, anon_vma); } /** * __folio_set_anon - set up a new anonymous rmap for a folio * @folio: The folio to set up the new anonymous rmap for. * @vma: VM area to add the folio to. * @address: User virtual address of the mapping * @exclusive: Whether the folio is exclusive to the process. */ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, unsigned long address, bool exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); /* * If the folio isn't exclusive to this vma, we must use the _oldest_ * possible anon_vma for the folio mapping! */ if (!exclusive) anon_vma = anon_vma->root; /* * page_idle does a lockless/optimistic rmap scan on folio->mapping. * Make sure the compiler doesn't split the stores of anon_vma and * the FOLIO_MAPPING_ANON type identifier, otherwise the rmap code * could mistake the mapping for a struct address_space and crash. */ anon_vma = (void *) anon_vma + FOLIO_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); } /** * __page_check_anon_rmap - sanity check anonymous rmap addition * @folio: The folio containing @page. * @page: the page to check the mapping of * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped */ static void __page_check_anon_rmap(const struct folio *folio, const struct page *page, struct vm_area_struct *vma, unsigned long address) { /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. * * We have exclusion against folio_add_anon_rmap_*() because the caller * always holds the page locked. * * We have exclusion against folio_add_new_anon_rmap because those pages * are initially only visible via the pagetables, and the pte is locked * over the call to folio_add_new_anon_rmap. */ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, folio); VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address), page); } static __always_inline void __folio_add_anon_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags, enum pgtable_level level) { int i; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); __folio_add_rmap(folio, page, nr_pages, vma, level); if (likely(!folio_test_ksm(folio))) __page_check_anon_rmap(folio, page, vma, address); if (flags & RMAP_EXCLUSIVE) { switch (level) { case PGTABLE_LEVEL_PTE: for (i = 0; i < nr_pages; i++) SetPageAnonExclusive(page + i); break; case PGTABLE_LEVEL_PMD: SetPageAnonExclusive(page); break; case PGTABLE_LEVEL_PUD: /* * Keep the compiler happy, we don't support anonymous * PUD mappings. */ WARN_ON_ONCE(1); break; default: BUILD_BUG(); } } VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) && atomic_read(&folio->_mapcount) > 0, folio); for (i = 0; i < nr_pages; i++) { struct page *cur_page = page + i; VM_WARN_ON_FOLIO(folio_test_large(folio) && folio_entire_mapcount(folio) > 1 && PageAnonExclusive(cur_page), folio); if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) continue; /* * While PTE-mapping a THP we have a PMD and a PTE * mapping. */ VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 && PageAnonExclusive(cur_page), folio); } /* * Only mlock it if the folio is fully mapped to the VMA. * * Partially mapped folios can be split on reclaim and part outside * of mlocked VMA can be evicted or freed. */ if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } /** * folio_add_anon_rmap_ptes - add PTE mappings to a page range of an anon folio * @folio: The folio to add the mappings to * @page: The first page to add * @nr_pages: The number of pages which will be mapped * @vma: The vm area in which the mappings are added * @address: The user virtual address of the first page to map * @flags: The rmap flags * * The page range of folio is defined by [first_page, first_page + nr_pages) * * The caller needs to hold the page table lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting, * and to ensure that an anon folio is not being upgraded racily to a KSM folio * (but KSM folios are never downgraded). */ void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { __folio_add_anon_rmap(folio, page, nr_pages, vma, address, flags, PGTABLE_LEVEL_PTE); } /** * folio_add_anon_rmap_pmd - add a PMD mapping to a page range of an anon folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * @address: The user virtual address of the first page to map * @flags: The rmap flags * * The page range of folio is defined by [first_page, first_page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock, and the page must be locked in * the anon_vma case: to serialize mapping,index checking after setting. */ void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address, flags, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_add_new_anon_rmap - Add mapping to a new anonymous folio. * @folio: The folio to add the mapping to. * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * @flags: The rmap flags * * Like folio_add_anon_rmap_*() but must only be called on *new* folios. * This means the inc-and-test can be bypassed. * The folio doesn't necessarily need to be locked while it's exclusive * unless two threads map it concurrently. However, the folio must be * locked if it's shared. * * If the folio is pmd-mappable, it is accounted as a THP. */ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { const bool exclusive = flags & RMAP_EXCLUSIVE; int nr = 1, nr_pmdmapped = 0; VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio); /* * VM_DROPPABLE mappings don't swap; instead they're just dropped when * under memory pressure. */ if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE)) __folio_set_swapbacked(folio); __folio_set_anon(folio, vma, address, exclusive); if (likely(!folio_test_large(folio))) { /* increment count (starts at -1) */ atomic_set(&folio->_mapcount, 0); if (exclusive) SetPageAnonExclusive(&folio->page); } else if (!folio_test_pmd_mappable(folio)) { int i; nr = folio_large_nr_pages(folio); for (i = 0; i < nr; i++) { struct page *page = folio_page(folio, i); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) /* increment count (starts at -1) */ atomic_set(&page->_mapcount, 0); if (exclusive) SetPageAnonExclusive(page); } folio_set_large_mapcount(folio, nr, vma); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_set(&folio->_nr_pages_mapped, nr); } else { nr = folio_large_nr_pages(folio); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_set_large_mapcount(folio, 1, vma); if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED); if (exclusive) SetPageAnonExclusive(&folio->page); nr_pmdmapped = nr; } VM_WARN_ON_ONCE(address < vma->vm_start || address + (nr << PAGE_SHIFT) > vma->vm_end); __folio_mod_stat(folio, nr, nr_pmdmapped); mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1); } static __always_inline void __folio_add_file_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum pgtable_level level) { VM_WARN_ON_FOLIO(folio_test_anon(folio), folio); __folio_add_rmap(folio, page, nr_pages, vma, level); /* * Only mlock it if the folio is fully mapped to the VMA. * * Partially mapped folios can be split on reclaim and part outside * of mlocked VMA can be evicted or freed. */ if (folio_nr_pages(folio) == nr_pages) mlock_vma_folio(folio, vma); } /** * folio_add_file_rmap_ptes - add PTE mappings to a page range of a folio * @folio: The folio to add the mappings to * @page: The first page to add * @nr_pages: The number of pages that will be mapped using PTEs * @vma: The vm area in which the mappings are added * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { __folio_add_file_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** * folio_add_file_rmap_pmd - add a PMD mapping to a page range of a folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_add_file_rmap_pud - add a PUD mapping to a page range of a folio * @folio: The folio to add the mapping to * @page: The first page to add * @vma: The vm area in which the mapping is added * * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) * * The caller needs to hold the page table lock. */ void folio_add_file_rmap_pud(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif } static __always_inline void __folio_remove_rmap(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma, enum pgtable_level level) { atomic_t *mapped = &folio->_nr_pages_mapped; int last = 0, nr = 0, nr_pmdmapped = 0; bool partially_mapped = false; __folio_rmap_sanity_checks(folio, page, nr_pages, level); switch (level) { case PGTABLE_LEVEL_PTE: if (!folio_test_large(folio)) { nr = atomic_add_negative(-1, &folio->_mapcount); break; } if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { nr = folio_sub_return_large_mapcount(folio, nr_pages, vma); if (!nr) { /* Now completely unmapped. */ nr = folio_large_nr_pages(folio); } else { partially_mapped = nr < folio_large_nr_pages(folio) && !folio_entire_mapcount(folio); nr = 0; } break; } folio_sub_large_mapcount(folio, nr_pages, vma); do { last += atomic_add_negative(-1, &page->_mapcount); } while (page++, --nr_pages > 0); if (last && atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED) nr = last; partially_mapped = nr && atomic_read(mapped); break; case PGTABLE_LEVEL_PMD: case PGTABLE_LEVEL_PUD: if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { last = atomic_add_negative(-1, &folio->_entire_mapcount); if (level == PGTABLE_LEVEL_PMD && last) nr_pmdmapped = folio_large_nr_pages(folio); nr = folio_dec_return_large_mapcount(folio, vma); if (!nr) { /* Now completely unmapped. */ nr = folio_large_nr_pages(folio); } else { partially_mapped = last && nr < folio_large_nr_pages(folio); nr = 0; } break; } folio_dec_large_mapcount(folio, vma); last = atomic_add_negative(-1, &folio->_entire_mapcount); if (last) { nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped); if (likely(nr < ENTIRELY_MAPPED)) { nr_pages = folio_large_nr_pages(folio); if (level == PGTABLE_LEVEL_PMD) nr_pmdmapped = nr_pages; nr = nr_pages - nr; /* Raced ahead of another remove and an add? */ if (unlikely(nr < 0)) nr = 0; } else { /* An add of ENTIRELY_MAPPED raced ahead */ nr = 0; } } partially_mapped = nr && nr < nr_pmdmapped; break; default: BUILD_BUG(); } /* * Queue anon large folio for deferred split if at least one page of * the folio is unmapped and at least one page is still mapped. * * Check partially_mapped first to ensure it is a large folio. */ if (partially_mapped && folio_test_anon(folio) && !folio_test_partially_mapped(folio)) deferred_split_folio(folio, true); __folio_mod_stat(folio, -nr, -nr_pmdmapped); /* * It would be tidy to reset folio_test_anon mapping when fully * unmapped, but that might overwrite a racing folio_add_anon_rmap_*() * which increments mapcount after us but sets mapping before us: * so leave the reset to free_pages_prepare, and remember that * it's only reliable while mapped. */ munlock_vma_folio(folio, vma); } /** * folio_remove_rmap_ptes - remove PTE mappings from a page range of a folio * @folio: The folio to remove the mappings from * @page: The first page to remove * @nr_pages: The number of pages that will be removed from the mapping * @vma: The vm area from which the mappings are removed * * The page range of the folio is defined by [page, page + nr_pages) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_ptes(struct folio *folio, struct page *page, int nr_pages, struct vm_area_struct *vma) { __folio_remove_rmap(folio, page, nr_pages, vma, PGTABLE_LEVEL_PTE); } /** * folio_remove_rmap_pmd - remove a PMD mapping from a page range of a folio * @folio: The folio to remove the mapping from * @page: The first page to remove * @vma: The vm area from which the mapping is removed * * The page range of the folio is defined by [page, page + HPAGE_PMD_NR) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_pmd(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma, PGTABLE_LEVEL_PMD); #else WARN_ON_ONCE(true); #endif } /** * folio_remove_rmap_pud - remove a PUD mapping from a page range of a folio * @folio: The folio to remove the mapping from * @page: The first page to remove * @vma: The vm area from which the mapping is removed * * The page range of the folio is defined by [page, page + HPAGE_PUD_NR) * * The caller needs to hold the page table lock. */ void folio_remove_rmap_pud(struct folio *folio, struct page *page, struct vm_area_struct *vma) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma, PGTABLE_LEVEL_PUD); #else WARN_ON_ONCE(true); #endif } static inline unsigned int folio_unmap_pte_batch(struct folio *folio, struct page_vma_mapped_walk *pvmw, enum ttu_flags flags, pte_t pte) { unsigned long end_addr, addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; unsigned int max_nr; if (flags & TTU_HWPOISON) return 1; if (!folio_test_large(folio)) return 1; /* We may only batch within a single VMA and a single page table. */ end_addr = pmd_addr_end(addr, vma->vm_end); max_nr = (end_addr - addr) >> PAGE_SHIFT; /* We only support lazyfree batching for now ... */ if (!folio_test_anon(folio) || folio_test_swapbacked(folio)) return 1; if (pte_unused(pte)) return 1; return folio_pte_batch(folio, pvmw->pte, pte, max_nr); } /* * @arg: enum ttu_flags will be passed to this argument */ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); bool anon_exclusive, ret = true; pte_t pteval; struct page *subpage; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long nr_pages = 1, end_addr; unsigned long pfn; unsigned long hsz = 0; int ptes = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), * try_to_unmap() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the folio can not be freed in this function as call of * try_to_unmap() must hold a reference on the folio. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* * If the folio is in an mlock()d vma, we must not swap it out. */ if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { ptes++; /* * Set 'ret' to indicate the page cannot be unmapped. * * Do not jump to walk_abort immediately as additional * iteration might be required to detect fully mapped * folio an mlock it. */ ret = false; /* Only mlock fully mapped pages */ if (pvmw.pte && ptes != pvmw.nr_pages) continue; /* * All PTEs must be protected by page table lock in * order to mlock the page. * * If page table boundary has been cross, current ptl * only protect part of ptes. */ if (pvmw.flags & PVMW_PGTABLE_CROSSED) goto walk_done; /* Restore the mlock which got missed */ mlock_vma_folio(folio, vma); goto walk_done; } if (!pvmw.pte) { if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) { if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio)) goto walk_done; /* * unmap_huge_pmd_locked has either already marked * the folio as swap-backed or decided to retain it * due to GUP or speculative references. */ goto walk_abort; } if (flags & TTU_SPLIT_HUGE_PMD) { /* * We temporarily have to drop the PTL and * restart so we can process the PTE-mapped THP. */ split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, false); flags &= ~TTU_SPLIT_HUGE_PMD; page_vma_mapped_walk_restart(&pvmw); continue; } } /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); /* * Handle PFN swap PTEs, such as device-exclusive ones, that * actually map pages. */ pteval = ptep_get(pvmw.pte); if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * The try_to_unmap() is only passed a hugetlb page * in the case where the hugetlb page is poisoned. */ VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and fail * if unsuccessful. */ if (!anon) { VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) goto walk_abort; if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map * counting is done for shared PMDs. * Return 'true' here. When there is * no other sharing, huge_pmd_unshare * returns false and we will unmap the * actual page and drop map count * to zero. */ goto walk_done; } hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) folio_mark_dirty(folio); } else if (likely(pte_present(pteval))) { nr_pages = folio_unmap_pte_batch(folio, &pvmw, flags, pteval); end_addr = address + nr_pages * PAGE_SIZE; flush_cache_range(vma, address, end_addr); /* Nuke the page table entry. */ pteval = get_and_clear_ptes(mm, address, pvmw.pte, nr_pages); /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ if (should_defer_flush(mm, flags)) set_tlb_ubc_flush_pending(mm, pteval, address, end_addr); else flush_tlb_range(vma, address, end_addr); if (pte_dirty(pteval)) folio_mark_dirty(folio); } else { pte_clear(mm, address, pvmw.pte); } /* * Now the pte is cleared. If this pte was uffd-wp armed, * we may want to replace a none pte with a marker pte if * it's file-backed, so we don't lose the tracking info. */ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) { pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (likely(pte_present(pteval)) && pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(folio)); } else if (folio_test_anon(folio)) { swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; /* * Store the swap location in the pte. * See handle_pte_fault() ... */ if (unlikely(folio_test_swapbacked(folio) != folio_test_swapcache(folio))) { WARN_ON_ONCE(1); goto walk_abort; } /* MADV_FREE page check */ if (!folio_test_swapbacked(folio)) { int ref_count, map_count; /* * Synchronize with gup_pte_range(): * - clear PTE; barrier; read refcount * - inc refcount; barrier; read PTE */ smp_mb(); ref_count = folio_ref_count(folio); map_count = folio_mapcount(folio); /* * Order reads for page refcount and dirty flag * (see comments in __remove_mapping()). */ smp_rmb(); if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) { /* * redirtied either using the page table or a previously * obtained GUP reference. */ set_ptes(mm, address, pvmw.pte, pteval, nr_pages); folio_set_swapbacked(folio); goto walk_abort; } else if (ref_count != 1 + map_count) { /* * Additional reference. Could be a GUP reference or any * speculative reference. GUP users must mark the folio * dirty if there was a modification. This folio cannot be * reclaimed right now either way, so act just like nothing * happened. * We'll come back here later and detect if the folio was * dirtied when the additional reference is gone. */ set_ptes(mm, address, pvmw.pte, pteval, nr_pages); goto walk_abort; } add_mm_counter(mm, MM_ANONPAGES, -nr_pages); goto discard; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } /* * arch_unmap_one() is expected to be a NOP on * architectures where we could have PFN swap PTEs, * so we'll not check/care. */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } /* See folio_try_share_anon_rmap(): clear PTE first. */ if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { swap_free(entry); set_pte_at(mm, address, pvmw.pte, pteval); goto walk_abort; } if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); inc_mm_counter(mm, MM_SWAPENTS); swp_pte = swp_entry_to_pte(entry); if (anon_exclusive) swp_pte = pte_swp_mkexclusive(swp_pte); if (likely(pte_present(pteval))) { if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } else { if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } set_pte_at(mm, address, pvmw.pte, swp_pte); } else { /* * This is a locked file-backed folio, * so it cannot be removed from the page * cache and replaced by a new folio before * mmu_notifier_invalidate_range_end, so no * concurrent thread might update its page table * to point at a new folio while a device is * still using this folio. * * See Documentation/mm/mmu_notifier.rst */ dec_mm_counter(mm, mm_counter_file(folio)); } discard: if (unlikely(folio_test_hugetlb(folio))) { hugetlb_remove_rmap(folio); } else { folio_remove_rmap_ptes(folio, subpage, nr_pages, vma); } if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put_refs(folio, nr_pages); /* * If we are sure that we batched the entire folio and cleared * all PTEs, we can just optimize and stop right here. */ if (nr_pages == folio_nr_pages(folio)) goto walk_done; continue; walk_abort: ret = false; walk_done: page_vma_mapped_walk_done(&pvmw); break; } mmu_notifier_invalidate_range_end(&range); return ret; } static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) { return vma_is_temporary_stack(vma); } static int folio_not_mapped(struct folio *folio) { return !folio_mapped(folio); } /** * try_to_unmap - Try to remove all page table mappings to a folio. * @folio: The folio to unmap. * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * folio. It is the caller's responsibility to check if the folio is * still mapped if needed (use TTU_SYNC to prevent accounting races). * * Context: Caller must hold the folio lock. */ void try_to_unmap(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } /* * @arg: enum ttu_flags will be passed to this argument. * * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs * containing migration entries. */ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); bool anon_exclusive, writable, ret = true; pte_t pteval; struct page *subpage; struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; unsigned long hsz = 0; /* * When racing against e.g. zap_pte_range() on another cpu, * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(), * try_to_migrate() may return before page_mapped() has become false, * if page table locking is skipped: use TTU_SYNC to wait for that. */ if (flags & TTU_SYNC) pvmw.flags = PVMW_SYNC; /* * For THP, we have to assume the worse case ie pmd for invalidation. * For hugetlb, it could be much worse if we need to do pud * invalidation in the case of pmd sharing. * * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ range.end = vma_address_end(&pvmw); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, address, range.end); if (folio_test_hugetlb(folio)) { /* * If sharing is possible, start and end will be adjusted * accordingly. */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); /* We need the huge page size for set_huge_pte_at() */ hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { /* PMD-mapped THP migration entry */ if (!pvmw.pte) { if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, true); ret = false; page_vma_mapped_walk_done(&pvmw); break; } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION subpage = folio_page(folio, pmd_pfn(*pvmw.pmd) - folio_pfn(folio)); VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) || !folio_test_pmd_mappable(folio), folio); if (set_pmd_migration_entry(&pvmw, subpage)) { ret = false; page_vma_mapped_walk_done(&pvmw); break; } continue; #endif } /* Unexpected PMD-mapped THP? */ VM_BUG_ON_FOLIO(!pvmw.pte, folio); /* * Handle PFN swap PTEs, such as device-exclusive ones, that * actually map pages. */ pteval = ptep_get(pvmw.pte); if (likely(pte_present(pteval))) { pfn = pte_pfn(pteval); } else { pfn = swp_offset_pfn(pte_to_swp_entry(pteval)); VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); } subpage = folio_page(folio, pfn - folio_pfn(folio)); address = pvmw.address; anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(subpage); if (folio_test_hugetlb(folio)) { bool anon = folio_test_anon(folio); /* * huge_pmd_unshare may unmap an entire PMD page. * There is no way of knowing exactly which PMDs may * be cached for this mm, so we must flush them all. * start/end were already adjusted above to cover this * range. */ flush_cache_range(vma, range.start, range.end); /* * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. * * We also must hold hugetlb vma_lock in write mode. * Lock order dictates acquiring vma_lock BEFORE * i_mmap_rwsem. We can only try lock here and * fail if unsuccessful. */ if (!anon) { VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); if (!hugetlb_vma_trylock_write(vma)) { page_vma_mapped_walk_done(&pvmw); ret = false; break; } if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map * counting is done for shared PMDs. * Return 'true' here. When there is * no other sharing, huge_pmd_unshare * returns false and we will unmap the * actual page and drop map count * to zero. */ page_vma_mapped_walk_done(&pvmw); break; } hugetlb_vma_unlock_write(vma); } /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); if (pte_dirty(pteval)) folio_mark_dirty(folio); writable = pte_write(pteval); } else if (likely(pte_present(pteval))) { flush_cache_page(vma, address, pfn); /* Nuke the page table entry. */ if (should_defer_flush(mm, flags)) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } if (pte_dirty(pteval)) folio_mark_dirty(folio); writable = pte_write(pteval); } else { pte_clear(mm, address, pvmw.pte); writable = is_writable_device_private_entry(pte_to_swp_entry(pteval)); } VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) && !anon_exclusive, folio); /* Update high watermark before we lower rss */ update_hiwater_rss(mm); if (PageHWPoison(subpage)) { VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio); pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); } else { dec_mm_counter(mm, mm_counter(folio)); set_pte_at(mm, address, pvmw.pte, pteval); } } else if (likely(pte_present(pteval)) && pte_unused(pteval) && !userfaultfd_armed(vma)) { /* * The guest indicated that the page content is of no * interest anymore. Simply discard the pte, vmscan * will take care of the rest. * A future reference will then fault in a new zero * page. When userfaultfd is active, we must not drop * this page though, as its main user (postcopy * migration) will not expect userfaults on already * copied pages. */ dec_mm_counter(mm, mm_counter(folio)); } else { swp_entry_t entry; pte_t swp_pte; /* * arch_unmap_one() is expected to be a NOP on * architectures where we could have PFN swap PTEs, * so we'll not check/care. */ if (arch_unmap_one(mm, vma, address, pteval) < 0) { if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); else set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* See folio_try_share_anon_rmap_pte(): clear PTE first. */ if (folio_test_hugetlb(folio)) { if (anon_exclusive && hugetlb_try_share_anon_rmap(folio)) { set_huge_pte_at(mm, address, pvmw.pte, pteval, hsz); ret = false; page_vma_mapped_walk_done(&pvmw); break; } } else if (anon_exclusive && folio_try_share_anon_rmap_pte(folio, subpage)) { set_pte_at(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ if (writable) entry = make_writable_migration_entry( page_to_pfn(subpage)); else if (anon_exclusive) entry = make_readable_exclusive_migration_entry( page_to_pfn(subpage)); else entry = make_readable_migration_entry( page_to_pfn(subpage)); if (likely(pte_present(pteval))) { if (pte_young(pteval)) entry = make_migration_entry_young(entry); if (pte_dirty(pteval)) entry = make_migration_entry_dirty(entry); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } else { swp_pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } if (folio_test_hugetlb(folio)) set_huge_pte_at(mm, address, pvmw.pte, swp_pte, hsz); else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), folio_order(folio)); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. */ } if (unlikely(folio_test_hugetlb(folio))) hugetlb_remove_rmap(folio); else folio_remove_rmap_pte(folio, subpage, vma); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); folio_put(folio); } mmu_notifier_invalidate_range_end(&range); return ret; } /** * try_to_migrate - try to replace all page table mappings with swap entries * @folio: the folio to replace page table entries for * @flags: action and flags * * Tries to remove all the page table entries which are mapping this folio and * replace them with special swap entries. Caller must hold the folio lock. */ void try_to_migrate(struct folio *folio, enum ttu_flags flags) { struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, .arg = (void *)flags, .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; /* * Migration always ignores mlock and only supports TTU_RMAP_LOCKED and * TTU_SPLIT_HUGE_PMD, TTU_SYNC, and TTU_BATCH_FLUSH flags. */ if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC | TTU_BATCH_FLUSH))) return; if (folio_is_zone_device(folio) && (!folio_is_device_private(folio) && !folio_is_device_coherent(folio))) return; /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the * page tables leading to a race where migration cannot * find the migration ptes. Rather than increasing the * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ if (!folio_test_ksm(folio) && folio_test_anon(folio)) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) rmap_walk_locked(folio, &rwc); else rmap_walk(folio, &rwc); } #ifdef CONFIG_DEVICE_PRIVATE /** * make_device_exclusive() - Mark a page for exclusive use by a device * @mm: mm_struct of associated target process * @addr: the virtual address to mark for exclusive device access * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering * @foliop: folio pointer will be stored here on success. * * This function looks up the page mapped at the given address, grabs a * folio reference, locks the folio and replaces the PTE with special * device-exclusive PFN swap entry, preventing access through the process * page tables. The function will return with the folio locked and referenced. * * On fault, the device-exclusive entries are replaced with the original PTE * under folio lock, after calling MMU notifiers. * * Only anonymous non-hugetlb folios are supported and the VMA must have * write permissions such that we can fault in the anonymous page writable * in order to mark it exclusive. The caller must hold the mmap_lock in read * mode. * * A driver using this to program access from a device must use a mmu notifier * critical section to hold a device specific lock during programming. Once * programming is complete it should drop the folio lock and reference after * which point CPU access to the page will revoke the exclusive access. * * Notes: * #. This function always operates on individual PTEs mapping individual * pages. PMD-sized THPs are first remapped to be mapped by PTEs before * the conversion happens on a single PTE corresponding to @addr. * #. While concurrent access through the process page tables is prevented, * concurrent access through other page references (e.g., earlier GUP * invocation) is not handled and not supported. * #. device-exclusive entries are considered "clean" and "old" by core-mm. * Device drivers must update the folio state when informed by MMU * notifiers. * * Returns: pointer to mapped page on success, otherwise a negative error. */ struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr, void *owner, struct folio **foliop) { struct mmu_notifier_range range; struct folio *folio, *fw_folio; struct vm_area_struct *vma; struct folio_walk fw; struct page *page; swp_entry_t entry; pte_t swp_pte; int ret; mmap_assert_locked(mm); addr = PAGE_ALIGN_DOWN(addr); /* * Fault in the page writable and try to lock it; note that if the * address would already be marked for exclusive use by a device, * the GUP call would undo that first by triggering a fault. * * If any other device would already map this page exclusively, the * fault will trigger a conversion to an ordinary * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE. */ retry: page = get_user_page_vma_remote(mm, addr, FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD, &vma); if (IS_ERR(page)) return page; folio = page_folio(page); if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) { folio_put(folio); return ERR_PTR(-EOPNOTSUPP); } ret = folio_lock_killable(folio); if (ret) { folio_put(folio); return ERR_PTR(ret); } /* * Inform secondary MMUs that we are going to convert this PTE to * device-exclusive, such that they unmap it now. Note that the * caller must filter this event out to prevent livelocks. */ mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, mm, addr, addr + PAGE_SIZE, owner); mmu_notifier_invalidate_range_start(&range); /* * Let's do a second walk and make sure we still find the same page * mapped writable. Note that any page of an anonymous folio can * only be mapped writable using exactly one PTE ("exclusive"), so * there cannot be other mappings. */ fw_folio = folio_walk_start(&fw, vma, addr, 0); if (fw_folio != folio || fw.page != page || fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) { if (fw_folio) folio_walk_end(&fw, vma); mmu_notifier_invalidate_range_end(&range); folio_unlock(folio); folio_put(folio); goto retry; } /* Nuke the page table entry so we get the uptodate dirty bit. */ flush_cache_page(vma, addr, page_to_pfn(page)); fw.pte = ptep_clear_flush(vma, addr, fw.ptep); /* Set the dirty flag on the folio now the PTE is gone. */ if (pte_dirty(fw.pte)) folio_mark_dirty(folio); /* * Store the pfn of the page in a special device-exclusive PFN swap PTE. * do_swap_page() will trigger the conversion back while holding the * folio lock. */ entry = make_device_exclusive_entry(page_to_pfn(page)); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(fw.pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); /* The pte is writable, uffd-wp does not apply. */ set_pte_at(mm, addr, fw.ptep, swp_pte); folio_walk_end(&fw, vma); mmu_notifier_invalidate_range_end(&range); *foliop = folio; return page; } EXPORT_SYMBOL_GPL(make_device_exclusive); #endif void __put_anon_vma(struct anon_vma *anon_vma) { struct anon_vma *root = anon_vma->root; anon_vma_free(anon_vma); if (root != anon_vma && atomic_dec_and_test(&root->refcount)) anon_vma_free(root); } static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; if (rwc->anon_lock) return rwc->anon_lock(folio, rwc); /* * Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read() * because that depends on page_mapped(); but not all its usages * are holding mmap_lock. Users without mmap_lock are required to * take a reference count to prevent the anon_vma disappearing */ anon_vma = folio_anon_vma(folio); if (!anon_vma) return NULL; if (anon_vma_trylock_read(anon_vma)) goto out; if (rwc->try_lock) { anon_vma = NULL; rwc->contended = true; goto out; } anon_vma_lock_read(anon_vma); out: return anon_vma; } /* * rmap_walk_anon - do something to anonymous page using the object-based * rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma * chains contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; if (locked) { anon_vma = folio_anon_vma(folio); /* anon_vma disappear under us? */ VM_BUG_ON_FOLIO(!anon_vma, folio); } else { anon_vma = rmap_walk_anon_lock(folio, rwc); } if (!anon_vma) return; pgoff_start = folio_pgoff(folio); pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff_start, pgoff_end) { struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(vma, pgoff_start, folio_nr_pages(folio)); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(folio)) break; } if (!locked) anon_vma_unlock_read(anon_vma); } /** * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping * of a page mapped within a specified page cache object at a specified offset. * * @folio: Either the folio whose mappings to traverse, or if NULL, * the callbacks specified in @rwc will be configured such * as to be able to look up mappings correctly. * @mapping: The page cache object whose mapping VMAs we intend to * traverse. If @folio is non-NULL, this should be equal to * folio_mapping(folio). * @pgoff_start: The offset within @mapping of the page which we are * looking up. If @folio is non-NULL, this should be equal * to folio_pgoff(folio). * @nr_pages: The number of pages mapped by the mapping. If @folio is * non-NULL, this should be equal to folio_nr_pages(folio). * @rwc: The reverse mapping walk control object describing how * the traversal should proceed. * @locked: Is the @mapping already locked? If not, we acquire the * lock. */ static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, pgoff_t pgoff_start, unsigned long nr_pages, struct rmap_walk_control *rwc, bool locked) { pgoff_t pgoff_end = pgoff_start + nr_pages - 1; struct vm_area_struct *vma; VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio); VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio); VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio); if (!locked) { if (i_mmap_trylock_read(mapping)) goto lookup; if (rwc->try_lock) { rwc->contended = true; return; } i_mmap_lock_read(mapping); } lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { unsigned long address = vma_address(vma, pgoff_start, nr_pages); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(folio, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(folio)) goto done; } done: if (!locked) i_mmap_unlock_read(mapping); } /* * rmap_walk_file - do something to file page using the object-based rmap method * @folio: the folio to be handled * @rwc: control variable according to each walk type * @locked: caller holds relevant rmap lock * * Find all the mappings of a folio using the mapping pointer and the vma chains * contained in the address_space struct it points to. */ static void rmap_walk_file(struct folio *folio, struct rmap_walk_control *rwc, bool locked) { /* * The folio lock not only makes sure that folio->mapping cannot * suddenly be NULLified by truncation, it makes sure that the structure * at mapping cannot be freed and reused yet, so we can safely take * mapping->i_mmap_rwsem. */ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio->mapping) return; __rmap_walk_file(folio, folio->mapping, folio->index, folio_nr_pages(folio), rwc, locked); } void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) rmap_walk_ksm(folio, rwc); else if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, false); else rmap_walk_file(folio, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio); if (folio_test_anon(folio)) rmap_walk_anon(folio, rwc, true); else rmap_walk_file(folio, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE /* * The following two functions are for anonymous (private mapped) hugepages. * Unlike common anonymous pages, anonymous hugepages have no accounting code * and no lru code, because we handle hugepages differently from common pages. */ void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); atomic_inc(&folio->_large_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && PageAnonExclusive(&folio->page), folio); } void hugetlb_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address) { VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio); BUG_ON(address < vma->vm_start || address >= vma->vm_end); /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); atomic_set(&folio->_large_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); __folio_set_anon(folio, vma, address, true); SetPageAnonExclusive(&folio->page); } #endif /* CONFIG_HUGETLB_PAGE */ |
| 17 2 10 6 4 2 3 7 4 36 29 7 1 36 6 64 68 68 2 16 13 28 26 1 1 25 1 1 21 25 2 23 1 20 2 52 52 52 11 3 2 1 34 1 44 1 25 5 6 28 28 924 929 159 773 92 2 2 2 3 2 2 2 2 2 2 2 1 2 3 1 3 3 3 1 4 1 3 1 5 2 3 1 5 1 19 1 1 2 4 2 1 2 3 2 1 1 1 466 1 2 1 8 4 6 3 1 2 4 2 2 3 1 3 1 3 2 2 1 1 2 1 2 1 2 2 1 2 1 3 1 1 3 2 3 2 3 3 1 3 1 1 1 1 4 2 4 1 2 2 2 2 1 2 1 4 1 1 15 8 2 1 2 2 2 1 36 4 2 25 26 2 5 5 127 1 33 2 1009 39 45 672 258 9 1 2 2 2 2 4 4 15 1 14 1 6 8 7 169 170 10 160 39 7 3 3 15 3 1 29 10 26 13 29 10 27 12 28 11 32 7 1 1 1 1 1 1 1 1 1 9 1 8 2 1 1 1 1 1 1 1 1 3 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 3 1 1 2 2 2 1 1 1 43 42 213 7 36 135 2 22 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 | // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 BSD socket options interface * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on linux/net/ipv4/ip_sockglue.c * * FIXME: Make the setsockopt code POSIX compliant: That is * * o Truncate getsockopt returns * o Return an optlen of the truncated length if need be * * Changes: * David L Stevens <dlstevens@us.ibm.com>: * - added multicast source filtering API for MLDv2 */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/sysctl.h> #include <linux/netfilter.h> #include <linux/slab.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> #include <net/psp.h> #include <linux/uaccess.h> struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount); int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_RAW) return -ENOPROTOOPT; new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; if (sel >= 0 && !new_ra) return -ENOMEM; write_lock_bh(&ip6_ra_lock); for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (sel >= 0) { write_unlock_bh(&ip6_ra_lock); kfree(new_ra); return -EADDRINUSE; } *rap = ra->next; write_unlock_bh(&ip6_ra_lock); sock_put(sk); kfree(ra); return 0; } } if (!new_ra) { write_unlock_bh(&ip6_ra_lock); return -ENOBUFS; } new_ra->sk = sk; new_ra->sel = sel; new_ra->next = ra; *rap = new_ra; sock_hold(sk); write_unlock_bh(&ip6_ra_lock); return 0; } struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt) { if (inet_test_bit(IS_ICSK, sk)) { if (opt && !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ext_hdr_len = psp_sk_overhead(sk) + opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = unrcu_pointer(xchg(&inet6_sk(sk)->opt, RCU_INITIALIZER(opt))); sk_dst_reset(sk); return opt; } static int copy_group_source_from_sockptr(struct group_source_req *greqs, sockptr_t optval, int optlen) { if (in_compat_syscall()) { struct compat_group_source_req gr32; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; greqs->gsr_interface = gr32.gsr_interface; greqs->gsr_group = gr32.gsr_group; greqs->gsr_source = gr32.gsr_source; } else { if (optlen < sizeof(*greqs)) return -EINVAL; if (copy_from_sockptr(greqs, optval, sizeof(*greqs))) return -EFAULT; } return 0; } static int do_ipv6_mcast_group_source(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct group_source_req greqs; int omode, add; int ret; ret = copy_group_source_from_sockptr(&greqs, optval, optlen); if (ret) return ret; if (greqs.gsr_group.ss_family != AF_INET6 || greqs.gsr_source.ss_family != AF_INET6) return -EADDRNOTAVAIL; if (optname == MCAST_BLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 1; } else if (optname == MCAST_UNBLOCK_SOURCE) { omode = MCAST_EXCLUDE; add = 0; } else if (optname == MCAST_JOIN_SOURCE_GROUP) { struct sockaddr_in6 *psin6; int retv; psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, &psin6->sin6_addr, MCAST_INCLUDE); /* prior join w/ different source is ok */ if (retv && retv != -EADDRINUSE) return retv; omode = MCAST_INCLUDE; add = 1; } else /* MCAST_LEAVE_SOURCE_GROUP */ { omode = MCAST_INCLUDE; add = 0; } return ip6_mc_source(add, omode, sk, &greqs); } static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { struct group_filter *gsf; int ret; if (optlen < GROUP_FILTER_SIZE(0)) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) return -ENOBUFS; gsf = memdup_sockptr(optval, optlen); if (IS_ERR(gsf)) return PTR_ERR(gsf); /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; if (gsf->gf_numsrc >= 0x1ffffffU || gsf->gf_numsrc > sysctl_mld_max_msf) goto out_free_gsf; ret = -EINVAL; if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; } static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; int n; if (optlen < size0) return -EINVAL; if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4) return -ENOBUFS; p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; /* numsrc >= (4G-140)/128 overflow in 32 bits */ ret = -ENOBUFS; n = gf32->gf_numsrc; if (n >= 0x1ffffffU || n > sysctl_mld_max_msf) goto out_free_p; ret = -EINVAL; if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); return ret; } static int ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct sockaddr_in6 *psin6; struct group_req greq; if (optlen < sizeof(greq)) return -EINVAL; if (copy_from_sockptr(&greq, optval, sizeof(greq))) return -EFAULT; if (greq.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&greq.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, greq.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr); } static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct compat_group_req gr32; struct sockaddr_in6 *psin6; if (optlen < sizeof(gr32)) return -EINVAL; if (copy_from_sockptr(&gr32, optval, sizeof(gr32))) return -EFAULT; if (gr32.gr_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; psin6 = (struct sockaddr_in6 *)&gr32.gr_group; if (optname == MCAST_JOIN_GROUP) return ipv6_sock_mc_join(sk, gr32.gr_interface, &psin6->sin6_addr); return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr); } static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_opt_hdr *new = NULL; struct net *net = sock_net(sk); struct ipv6_txoptions *opt; int err; /* hop-by-hop / destination options are privileged option */ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option * length, per RFC3542. */ if (optlen > 0) { if (sockptr_is_null(optval)) return -EINVAL; if (optlen < sizeof(struct ipv6_opt_hdr) || optlen & 0x7 || optlen > 8 * 255) return -EINVAL; new = memdup_sockptr(optval, optlen); if (IS_ERR(new)) return PTR_ERR(new); if (unlikely(ipv6_optlen(new) > optlen)) { kfree(new); return -EINVAL; } } opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, new); kfree(new); if (IS_ERR(opt)) return PTR_ERR(opt); /* routing header option needs extra check */ err = -EINVAL; if (optname == IPV6_RTHDR && opt && opt->srcrt) { struct ipv6_rt_hdr *rthdr = opt->srcrt; switch (rthdr->type) { #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPV6_SRCRT_TYPE_2: if (rthdr->hdrlen != 2 || rthdr->segments_left != 1) goto sticky_done; break; #endif case IPV6_SRCRT_TYPE_4: { struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)opt->srcrt; if (!seg6_validate_srh(srh, optlen, false)) goto sticky_done; break; } default: goto sticky_done; } } err = 0; opt = ipv6_update_options(sk, opt); sticky_done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } return err; } int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int retv = -ENOPROTOOPT; int val, valbool; if (sockptr_is_null(optval)) val = 0; else { if (optlen >= sizeof(int)) { if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; } else val = 0; } valbool = (val != 0); if (ip6_mroute_opt(optname)) return ip6_mroute_setsockopt(sk, optname, optval, optlen); /* Handle options that can be set without locking the socket. */ switch (optname) { case IPV6_UNICAST_HOPS: if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->hop_limit, val); return 0; case IPV6_MULTICAST_LOOP: if (optlen < sizeof(int)) return -EINVAL; if (val != valbool) return -EINVAL; inet6_assign_bit(MC6_LOOP, sk, valbool); return 0; case IPV6_MULTICAST_HOPS: if (sk->sk_type == SOCK_STREAM) return retv; if (optlen < sizeof(int)) return -EINVAL; if (val > 255 || val < -1) return -EINVAL; WRITE_ONCE(np->mcast_hops, val == -1 ? IPV6_DEFAULT_MCASTHOPS : val); return 0; case IPV6_MTU: if (optlen < sizeof(int)) return -EINVAL; if (val && val < IPV6_MIN_MTU) return -EINVAL; WRITE_ONCE(np->frag_size, val); return 0; case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 255) return -EINVAL; if (val) static_branch_enable(&ip6_min_hopcount); /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount * while we are changing it. */ WRITE_ONCE(np->min_hopcount, val); return 0; case IPV6_RECVERR_RFC4884: if (optlen < sizeof(int)) return -EINVAL; if (val < 0 || val > 1) return -EINVAL; inet6_assign_bit(RECVERR6_RFC4884, sk, valbool); return 0; case IPV6_MULTICAST_ALL: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(MC6_ALL, sk, valbool); return 0; case IPV6_AUTOFLOWLABEL: inet6_assign_bit(AUTOFLOWLABEL, sk, valbool); inet6_set_bit(AUTOFLOWLABEL_SET, sk); return 0; case IPV6_DONTFRAG: inet6_assign_bit(DONTFRAG, sk, valbool); return 0; case IPV6_RECVERR: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RECVERR6, sk, valbool); if (!val) skb_errqueue_purge(&sk->sk_error_queue); return 0; case IPV6_ROUTER_ALERT_ISOLATE: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(RTALERT_ISOLATE, sk, valbool); return 0; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) return -EINVAL; if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT) return -EINVAL; WRITE_ONCE(np->pmtudisc, val); return 0; case IPV6_FLOWINFO_SEND: if (optlen < sizeof(int)) return -EINVAL; inet6_assign_bit(SNDFLOW, sk, valbool); return 0; case IPV6_ADDR_PREFERENCES: if (optlen < sizeof(int)) return -EINVAL; return ip6_sock_set_addr_preferences(sk, val); case IPV6_MULTICAST_IF: if (sk->sk_type == SOCK_STREAM) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (val) { struct net_device *dev; int bound_dev_if, midx; rcu_read_lock(); dev = dev_get_by_index_rcu(net, val); if (!dev) { rcu_read_unlock(); return -ENODEV; } midx = l3mdev_master_ifindex_rcu(dev); rcu_read_unlock(); bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && bound_dev_if != val && (!midx || midx != bound_dev_if)) return -EINVAL; } WRITE_ONCE(np->mcast_oif, val); return 0; case IPV6_UNICAST_IF: { struct net_device *dev; int ifindex; if (optlen != sizeof(int)) return -EINVAL; ifindex = (__force int)ntohl((__force __be32)val); if (!ifindex) { WRITE_ONCE(np->ucast_oif, 0); return 0; } dev = dev_get_by_index(net, ifindex); if (!dev) return -EADDRNOTAVAIL; dev_put(dev); if (READ_ONCE(sk->sk_bound_dev_if)) return -EINVAL; WRITE_ONCE(np->ucast_oif, ifindex); return 0; } } sockopt_lock_sock(sk); /* Another thread has converted the socket into IPv4 with * IPV6_ADDRFORM concurrently. */ if (unlikely(sk->sk_family != AF_INET6)) goto unlock; switch (optname) { case IPV6_ADDRFORM: if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { if (sk->sk_type == SOCK_RAW) break; if (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_UDPLITE) { struct udp_sock *up = udp_sk(sk); if (up->pending == AF_INET6) { retv = -EBUSY; break; } } else if (sk->sk_protocol == IPPROTO_TCP) { if (sk->sk_prot != &tcpv6_prot) { retv = -EBUSY; break; } } else { break; } if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; } if (ipv6_only_sock(sk) || !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { retv = -EADDRNOTAVAIL; break; } __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); if (sk->sk_protocol == IPPROTO_TCP) { struct inet_connection_sock *icsk = inet_csk(sk); sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops); WRITE_ONCE(sk->sk_family, PF_INET); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { struct proto *prot = &udp_prot; if (sk->sk_protocol == IPPROTO_UDPLITE) prot = &udplite_prot; sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops); WRITE_ONCE(sk->sk_family, PF_INET); } /* Disable all options not to allocate memory anymore, * but there is still a race. See the lockless path * in udpv6_sendmsg() and ipv6_local_rxpmtu(). */ np->rxopt.all = 0; inet6_cleanup_sock(sk); module_put(THIS_MODULE); retv = 0; break; } goto e_inval; case IPV6_V6ONLY: if (optlen < sizeof(int) || inet_sk(sk)->inet_num) goto e_inval; sk->sk_ipv6only = valbool; retv = 0; break; case IPV6_RECVPKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxinfo = valbool; retv = 0; break; case IPV6_2292PKTINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxoinfo = valbool; retv = 0; break; case IPV6_RECVHOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxhlim = valbool; retv = 0; break; case IPV6_2292HOPLIMIT: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxohlim = valbool; retv = 0; break; case IPV6_RECVRTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.srcrt = valbool; retv = 0; break; case IPV6_2292RTHDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.osrcrt = valbool; retv = 0; break; case IPV6_RECVHOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.hopopts = valbool; retv = 0; break; case IPV6_2292HOPOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.ohopopts = valbool; retv = 0; break; case IPV6_RECVDSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.dstopts = valbool; retv = 0; break; case IPV6_2292DSTOPTS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.odstopts = valbool; retv = 0; break; case IPV6_TCLASS: if (optlen < sizeof(int)) goto e_inval; if (val < -1 || val > 0xff) goto e_inval; /* RFC 3542, 6.5: default traffic class of 0x0 */ if (val == -1) val = 0; if (sk->sk_type == SOCK_STREAM) { val &= ~INET_ECN_MASK; val |= np->tclass & INET_ECN_MASK; } if (np->tclass != val) { np->tclass = val; sk_dst_reset(sk); } retv = 0; break; case IPV6_RECVTCLASS: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxtclass = valbool; retv = 0; break; case IPV6_FLOWINFO: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxflow = valbool; retv = 0; break; case IPV6_RECVPATHMTU: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxpmtu = valbool; retv = 0; break; case IPV6_TRANSPARENT: if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } if (optlen < sizeof(int)) goto e_inval; /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ inet_assign_bit(TRANSPARENT, sk, valbool); retv = 0; break; case IPV6_FREEBIND: if (optlen < sizeof(int)) goto e_inval; /* we also don't have a separate freebind bit for IPV6 */ inet_assign_bit(FREEBIND, sk, valbool); retv = 0; break; case IPV6_RECVORIGDSTADDR: if (optlen < sizeof(int)) goto e_inval; np->rxopt.bits.rxorigdstaddr = valbool; retv = 0; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: retv = ipv6_set_opt_hdr(sk, optname, optval, optlen); break; case IPV6_PKTINFO: { struct in6_pktinfo pkt; if (optlen == 0) goto e_inval; else if (optlen < sizeof(struct in6_pktinfo) || sockptr_is_null(optval)) goto e_inval; if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) { retv = -EFAULT; break; } if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex)) goto e_inval; np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex; np->sticky_pktinfo.ipi6_addr = pkt.ipi6_addr; retv = 0; break; } case IPV6_2292PKTOPTIONS: { struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; if (optlen == 0) goto update; /* 1K is probably excessive * 1K is surely not enough, 2K per standard header is 16K. */ retv = -EINVAL; if (optlen > 64*1024) break; opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); retv = -ENOBUFS; if (!opt) break; memset(opt, 0, sizeof(*opt)); refcount_set(&opt->refcnt, 1); opt->tot_len = sizeof(*opt) + optlen; retv = -EFAULT; if (copy_from_sockptr(opt + 1, optval, optlen)) goto done; msg.msg_controllen = optlen; msg.msg_control_is_user = false; msg.msg_control = (void *)(opt+1); ipc6.opt = opt; retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6); if (retv) goto done; update: retv = 0; opt = ipv6_update_options(sk, opt); done: if (opt) { atomic_sub(opt->tot_len, &sk->sk_omem_alloc); txopt_put(opt); } break; } case IPV6_ADD_MEMBERSHIP: case IPV6_DROP_MEMBERSHIP: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EPROTO; if (inet_test_bit(IS_ICSK, sk)) break; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_ADD_MEMBERSHIP) retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); else retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); break; } case IPV6_JOIN_ANYCAST: case IPV6_LEAVE_ANYCAST: { struct ipv6_mreq mreq; if (optlen < sizeof(struct ipv6_mreq)) goto e_inval; retv = -EFAULT; if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq))) break; if (optname == IPV6_JOIN_ANYCAST) retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); else retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: if (in_compat_syscall()) retv = compat_ipv6_mcast_join_leave(sk, optname, optval, optlen); else retv = ipv6_mcast_join_leave(sk, optname, optval, optlen); break; case MCAST_JOIN_SOURCE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_BLOCK_SOURCE: case MCAST_UNBLOCK_SOURCE: retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen); break; case MCAST_MSFILTER: if (in_compat_syscall()) retv = compat_ipv6_set_mcast_msfilter(sk, optval, optlen); else retv = ipv6_set_mcast_msfilter(sk, optval, optlen); break; case IPV6_ROUTER_ALERT: if (optlen < sizeof(int)) goto e_inval; retv = ip6_ra_control(sk, val); if (retv == 0) inet6_assign_bit(RTALERT, sk, valbool); break; case IPV6_FLOWLABEL_MGR: retv = ipv6_flowlabel_opt(sk, optval, optlen); break; case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; case IPV6_RECVFRAGSIZE: np->rxopt.bits.recvfragsize = valbool; retv = 0; break; } unlock: sockopt_release_sock(sk); return retv; e_inval: retv = -EINVAL; goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return ip_setsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY && optname != IPV6_XFRM_POLICY) err = nf_setsockopt(sk, PF_INET6, optname, optval, optlen); #endif return err; } EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; if (!opt) return 0; switch (optname) { case IPV6_HOPOPTS: hdr = opt->hopopt; break; case IPV6_RTHDRDSTOPTS: hdr = opt->dst0opt; break; case IPV6_RTHDR: hdr = (struct ipv6_opt_hdr *)opt->srcrt; break; case IPV6_DSTOPTS: hdr = opt->dst1opt; break; default: return -EINVAL; /* should not happen */ } if (!hdr) return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter gsf; int num; int err; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; len = GROUP_FILTER_SIZE(num); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } sockopt_release_sock(sk); return err; } static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter gf32; struct group_filter gf; int err; int num; if (len < size0) return -EINVAL; if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; sockopt_lock_sock(sk); err = ip6_mc_msfget(sk, &gf, optval, size0); sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); if (copy_to_sockptr(optlen, &len, sizeof(int)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), &gf.gf_fmode, sizeof(gf32.gf_fmode)) || copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } int do_ipv6_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; int val; if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; switch (optname) { case IPV6_ADDRFORM: if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) return -ENOPROTOOPT; if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; val = sk->sk_family; break; case MCAST_MSFILTER: if (in_compat_syscall()) return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { struct msghdr msg; struct sk_buff *skb; if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; if (optval.is_kernel) { msg.msg_control_is_user = false; msg.msg_control = optval.kernel; } else { msg.msg_control_is_user = true; msg.msg_control_user = optval.user; } msg.msg_controllen = len; msg.msg_flags = 0; sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxhlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxtclass) { int tclass = (int)ip6_tclass(np->rcv_flowinfo); put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass); } if (np->rxopt.bits.rxoinfo) { int mcast_oif = READ_ONCE(np->mcast_oif); struct in6_pktinfo src_info; src_info.ipi6_ifindex = mcast_oif ? : np->sticky_pktinfo.ipi6_ifindex; src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr; put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info); } if (np->rxopt.bits.rxohlim) { int hlim = READ_ONCE(np->mcast_hops); put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } if (np->rxopt.bits.rxflow) { __be32 flowinfo = np->rcv_flowinfo; put_cmsg(&msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); } } len -= msg.msg_controllen; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { struct dst_entry *dst; val = 0; rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = dst_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; break; } case IPV6_V6ONLY: val = sk->sk_ipv6only; break; case IPV6_RECVPKTINFO: val = np->rxopt.bits.rxinfo; break; case IPV6_2292PKTINFO: val = np->rxopt.bits.rxoinfo; break; case IPV6_RECVHOPLIMIT: val = np->rxopt.bits.rxhlim; break; case IPV6_2292HOPLIMIT: val = np->rxopt.bits.rxohlim; break; case IPV6_RECVRTHDR: val = np->rxopt.bits.srcrt; break; case IPV6_2292RTHDR: val = np->rxopt.bits.osrcrt; break; case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: case IPV6_DSTOPTS: { struct ipv6_txoptions *opt; sockopt_lock_sock(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); sockopt_release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) return len; return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_RECVHOPOPTS: val = np->rxopt.bits.hopopts; break; case IPV6_2292HOPOPTS: val = np->rxopt.bits.ohopopts; break; case IPV6_RECVDSTOPTS: val = np->rxopt.bits.dstopts; break; case IPV6_2292DSTOPTS: val = np->rxopt.bits.odstopts; break; case IPV6_TCLASS: val = np->tclass; break; case IPV6_RECVTCLASS: val = np->rxopt.bits.rxtclass; break; case IPV6_FLOWINFO: val = np->rxopt.bits.rxflow; break; case IPV6_RECVPATHMTU: val = np->rxopt.bits.rxpmtu; break; case IPV6_PATHMTU: { struct dst_entry *dst; struct ip6_mtuinfo mtuinfo; if (len < sizeof(mtuinfo)) return -EINVAL; len = sizeof(mtuinfo); memset(&mtuinfo, 0, sizeof(mtuinfo)); rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) mtuinfo.ip6m_mtu = dst_mtu(dst); rcu_read_unlock(); if (!mtuinfo.ip6m_mtu) return -ENOTCONN; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &mtuinfo, len)) return -EFAULT; return 0; } case IPV6_TRANSPARENT: val = inet_test_bit(TRANSPARENT, sk); break; case IPV6_FREEBIND: val = inet_test_bit(FREEBIND, sk); break; case IPV6_RECVORIGDSTADDR: val = np->rxopt.bits.rxorigdstaddr; break; case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { struct dst_entry *dst; if (optname == IPV6_UNICAST_HOPS) val = READ_ONCE(np->hop_limit); else val = READ_ONCE(np->mcast_hops); if (val < 0) { rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) val = ip6_dst_hoplimit(dst); rcu_read_unlock(); } if (val < 0) val = READ_ONCE(sock_net(sk)->ipv6.devconf_all->hop_limit); break; } case IPV6_MULTICAST_LOOP: val = inet6_test_bit(MC6_LOOP, sk); break; case IPV6_MULTICAST_IF: val = READ_ONCE(np->mcast_oif); break; case IPV6_MULTICAST_ALL: val = inet6_test_bit(MC6_ALL, sk); break; case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif)); break; case IPV6_MTU_DISCOVER: val = READ_ONCE(np->pmtudisc); break; case IPV6_RECVERR: val = inet6_test_bit(RECVERR6, sk); break; case IPV6_FLOWINFO_SEND: val = inet6_test_bit(SNDFLOW, sk); break; case IPV6_FLOWLABEL_MGR: { struct in6_flowlabel_req freq; int flags; if (len < sizeof(freq)) return -EINVAL; if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; if (freq.flr_action != IPV6_FL_A_GET) return -EINVAL; len = sizeof(freq); flags = freq.flr_flags; memset(&freq, 0, sizeof(freq)); val = ipv6_flowlabel_opt_get(sk, &freq, flags); if (val < 0) return val; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &freq, len)) return -EFAULT; return 0; } case IPV6_ADDR_PREFERENCES: { u8 srcprefs = READ_ONCE(np->srcprefs); val = 0; if (srcprefs & IPV6_PREFER_SRC_TMP) val |= IPV6_PREFER_SRC_TMP; else if (srcprefs & IPV6_PREFER_SRC_PUBLIC) val |= IPV6_PREFER_SRC_PUBLIC; else { /* XXX: should we return system default? */ val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT; } if (srcprefs & IPV6_PREFER_SRC_COA) val |= IPV6_PREFER_SRC_COA; else val |= IPV6_PREFER_SRC_HOME; break; } case IPV6_MINHOPCOUNT: val = READ_ONCE(np->min_hopcount); break; case IPV6_DONTFRAG: val = inet6_test_bit(DONTFRAG, sk); break; case IPV6_AUTOFLOWLABEL: val = ip6_autoflowlabel(sock_net(sk), sk); break; case IPV6_RECVFRAGSIZE: val = np->rxopt.bits.recvfragsize; break; case IPV6_ROUTER_ALERT: val = inet6_test_bit(RTALERT, sk); break; case IPV6_ROUTER_ALERT_ISOLATE: val = inet6_test_bit(RTALERT_ISOLATE, sk); break; case IPV6_RECVERR_RFC4884: val = inet6_test_bit(RECVERR6_RFC4884, sk); break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { int err; if (level == SOL_IP && sk->sk_type != SOCK_RAW) return ip_getsockopt(sk, level, optname, optval, optlen); if (level != SOL_IPV6) return -ENOPROTOOPT; err = do_ipv6_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { int len; if (get_user(len, optlen)) return -EFAULT; err = nf_getsockopt(sk, PF_INET6, optname, optval, &len); if (err >= 0) err = put_user(len, optlen); } #endif return err; } EXPORT_SYMBOL(ipv6_getsockopt); |
| 11 11 11 11 11 10 11 11 9 2 11 15 15 15 11 7 11 81 18 18 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 | // SPDX-License-Identifier: GPL-2.0 /* * Functions to sequence PREFLUSH and FUA writes. * * Copyright (C) 2011 Max Planck Institute for Gravitational Physics * Copyright (C) 2011 Tejun Heo <tj@kernel.org> * * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request * properties and hardware capability. * * If a request doesn't have data, only REQ_PREFLUSH makes sense, which * indicates a simple flush request. If there is data, REQ_PREFLUSH indicates * that the device cache should be flushed before the data is executed, and * REQ_FUA means that the data must be on non-volatile media on request * completion. * * If the device doesn't have writeback cache, PREFLUSH and FUA don't make any * difference. The requests are either completed immediately if there's no data * or executed as normal requests otherwise. * * If the device has writeback cache and supports FUA, REQ_PREFLUSH is * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. * * If the device has writeback cache and doesn't support FUA, REQ_PREFLUSH * is translated to PREFLUSH and REQ_FUA to POSTFLUSH. * * The actual execution of flush is double buffered. Whenever a request * needs to execute PRE or POSTFLUSH, it queues at * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a * REQ_OP_FLUSH is issued and the pending_idx is toggled. When the flush * completes, all the requests which were pending are proceeded to the next * step. This allows arbitrary merging of different types of PREFLUSH/FUA * requests. * * Currently, the following conditions are used to determine when to issue * flush. * * C1. At any given time, only one flush shall be in progress. This makes * double buffering sufficient. * * C2. Flush is deferred if any request is executing DATA of its sequence. * This avoids issuing separate POSTFLUSHes for requests which shared * PREFLUSH. * * C3. The second condition is ignored if there is a request which has * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid * starvation in the unlikely case where there are continuous stream of * FUA (without PREFLUSH) requests. * * For devices which support FUA, it isn't clear whether C2 (and thus C3) * is beneficial. * * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice. * Once while executing DATA and again after the whole sequence is * complete. The first completion updates the contained bio but doesn't * finish it so that the bio submitter is notified only after the whole * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in * req_bio_endio(). * * The above peculiarity requires that each PREFLUSH/FUA request has only one * bio attached to it, which is guaranteed as they aren't allowed to be * merged in the usual way. */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> #include <linux/part_stat.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" /* PREFLUSH/FUA sequences */ enum { REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ REQ_FSEQ_DONE = (1 << 3), REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH, /* * If flush has been pending longer than the following timeout, * it's issued even if flush_data requests are still in flight. */ FLUSH_PENDING_TIMEOUT = 5 * HZ, }; static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags); static inline struct blk_flush_queue * blk_get_flush_queue(struct blk_mq_ctx *ctx) { return blk_mq_map_queue(REQ_OP_FLUSH, ctx)->fq; } static unsigned int blk_flush_cur_seq(struct request *rq) { return 1 << ffz(rq->flush.seq); } static void blk_flush_restore_request(struct request *rq) { /* * After flush data completion, @rq->bio is %NULL but we need to * complete the bio again. @rq->biotail is guaranteed to equal the * original @rq->bio. Restore it. */ rq->bio = rq->biotail; if (rq->bio) rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; rq->end_io = rq->flush.saved_end_io; } static void blk_account_io_flush(struct request *rq) { struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); } /** * blk_flush_complete_seq - complete flush sequence * @rq: PREFLUSH/FUA request being sequenced * @fq: flush queue * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) * @error: whether an error occurred * * @rq just completed @seq part of its flush sequence, record the * completion and trigger the next step. * * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) */ static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, unsigned int seq, blk_status_t error) { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; blk_opf_t cmd_flags; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; cmd_flags = rq->cmd_flags; if (likely(!error)) seq = blk_flush_cur_seq(rq); else seq = REQ_FSEQ_DONE; switch (seq) { case REQ_FSEQ_PREFLUSH: case REQ_FSEQ_POSTFLUSH: /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; list_add_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: fq->flush_data_in_flight++; spin_lock(&q->requeue_lock); list_move(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; case REQ_FSEQ_DONE: /* * @rq was previously adjusted by blk_insert_flush() for * flush sequencing and may already have gone through the * flush data request completion path. Restore @rq for * normal completion and end it. */ list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; default: BUG(); } blk_kick_flush(q, fq, cmd_flags); } static enum rq_end_io_ret flush_end_io(struct request *flush_rq, blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(flush_rq->mq_ctx); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } blk_account_io_flush(flush_rq); /* * Flush request has to be marked as IDLE when it is really ended * because its .end_io() is called from timeout code path too for * avoiding use-after-free. */ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; fq->rq_status = BLK_STS_OK; } if (!q->elevator) { flush_rq->tag = BLK_MQ_NO_TAG; } else { blk_mq_put_driver_tag(flush_rq); flush_rq->internal_tag = BLK_MQ_NO_TAG; } running = &fq->flush_queue[fq->flush_running_idx]; BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); /* account completion of the flush request */ fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); list_del_init(&rq->queuelist); blk_flush_complete_seq(rq, fq, seq, error); } spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return RQ_END_IO_NONE; } bool is_flush_rq(struct request *rq) { return rq->end_io == flush_end_io; } /** * blk_kick_flush - consider issuing flush request * @q: request_queue being kicked * @fq: flush queue * @flags: cmd_flags of the original request * * Flush related states of @q have changed, consider issuing flush request. * Please read the comment at the top of this file for more info. * * CONTEXT: * spin_lock_irq(fq->mq_flush_lock) * */ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, blk_opf_t flags) { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return; /* C2 and C3 */ if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; /* * Issue flush and toggle pending_idx. This makes pending_idx * different from running_idx, which means flush is in flight. */ fq->flush_pending_idx ^= 1; blk_rq_init(q, flush_rq); /* * In case of none scheduler, borrow tag from the first request * since they can't be in flight at the same time. And acquire * the tag's ownership for flush req. * * In case of IO scheduler, flush rq need to borrow scheduler tag * just for cheating put/get driver tag. */ flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; if (!q->elevator) flush_rq->tag = first_rq->tag; else flush_rq->internal_tag = first_rq->internal_tag; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * implied in refcount_inc_not_zero() called from * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref * and READ flush_rq->end_io */ smp_wmb(); req_ref_set(flush_rq, 1); spin_lock(&q->requeue_lock); list_add_tail(&flush_rq->queuelist, &q->flush_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); } static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(ctx); if (q->elevator) { WARN_ON(rq->tag < 0); blk_mq_put_driver_tag(rq); } /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); fq->flush_data_in_flight--; /* * May have been corrupted by rq->rq_next reuse, we need to * re-initialize rq->queuelist before reusing it here. */ INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); blk_mq_sched_restart(hctx); return RQ_END_IO_NONE; } static void blk_rq_init_flush(struct request *rq) { rq->flush.seq = 0; rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ rq->end_io = mq_flush_data_end_io; } /* * Insert a PREFLUSH/FUA request into the flush state machine. * Returns true if the request has been consumed by the flush state machine, * or false if the caller should continue to process it. */ bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; struct blk_flush_queue *fq = blk_get_flush_queue(rq->mq_ctx); bool supports_fua = q->limits.features & BLK_FEAT_FUA; unsigned int policy = 0; /* FLUSH/FUA request must never be merged */ WARN_ON_ONCE(rq->bio != rq->biotail); if (blk_rq_sectors(rq)) policy |= REQ_FSEQ_DATA; /* * Check which flushes we need to sequence for this operation. */ if (blk_queue_write_cache(q)) { if (rq->cmd_flags & REQ_PREFLUSH) policy |= REQ_FSEQ_PREFLUSH; if ((rq->cmd_flags & REQ_FUA) && !supports_fua) policy |= REQ_FSEQ_POSTFLUSH; } /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. */ rq->cmd_flags &= ~REQ_PREFLUSH; if (!supports_fua) rq->cmd_flags &= ~REQ_FUA; /* * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any * of those flags, we have to set REQ_SYNC to avoid skewing * the request accounting. */ rq->cmd_flags |= REQ_SYNC; switch (policy) { case 0: /* * An empty flush handed down from a stacking driver may * translate into nothing if the underlying device does not * advertise a write-back cache. In this case, simply * complete the request. */ blk_mq_end_request(rq, 0); return true; case REQ_FSEQ_DATA: /* * If there's data, but no flush is necessary, the request can * be processed directly without going through flush machinery. * Queue for normal execution. */ return false; case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: /* * Initialize the flush fields and completion handler to trigger * the post flush, and then just pass the command on. */ blk_rq_init_flush(rq); rq->flush.seq |= REQ_FSEQ_PREFLUSH; spin_lock_irq(&fq->mq_flush_lock); fq->flush_data_in_flight++; spin_unlock_irq(&fq->mq_flush_lock); return false; default: /* * Mark the request as part of a flush sequence and submit it * for further processing to the flush state machine. */ blk_rq_init_flush(rq); spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); spin_unlock_irq(&fq->mq_flush_lock); return true; } } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for * * Description: * Issue a flush for the block device in question. */ int blkdev_issue_flush(struct block_device *bdev) { struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH); return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, gfp_t flags) { struct blk_flush_queue *fq; int rq_sz = sizeof(struct request); fq = kzalloc_node(sizeof(*fq), flags, node); if (!fq) goto fail; spin_lock_init(&fq->mq_flush_lock); rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); fq->flush_rq = kzalloc_node(rq_sz, flags, node); if (!fq->flush_rq) goto fail_rq; INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); return fq; fail_rq: kfree(fq); fail: return NULL; } void blk_free_flush_queue(struct blk_flush_queue *fq) { /* bio based request queue hasn't flush queue */ if (!fq) return; kfree(fq->flush_rq); kfree(fq); } /* * Allow driver to set its own lock class to fq->mq_flush_lock for * avoiding lockdep complaint. * * flush_end_io() may be called recursively from some driver, such as * nvme-loop, so lockdep may complain 'possible recursive locking' because * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class * key. We need to assign different lock class for these driver's * fq->mq_flush_lock for avoiding the lockdep warning. * * Use dynamically allocated lock class key for each 'blk_flush_queue' * instance is over-kill, and more worse it introduces horrible boot delay * issue because synchronize_rcu() is implied in lockdep_unregister_key which * is called for each hctx release. SCSI probing may synchronously create and * destroy lots of MQ request_queues for non-existent devices, and some robot * test kernel always enable lockdep option. It is observed that more than half * an hour is taken during SCSI MQ probe with per-fq lock class. */ void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key) { lockdep_set_class(&hctx->fq->mq_flush_lock, key); } EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class); |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 | // SPDX-License-Identifier: GPL-2.0-only /* * OF helpers for the MDIO (Ethernet PHY) API * * Copyright (c) 2009 Secret Lab Technologies, Ltd. * * This file provides helper functions for extracting PHY device information * out of the OpenFirmware device tree and using it to populate an mii_bus. */ #include <linux/device.h> #include <linux/err.h> #include <linux/fwnode_mdio.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/of.h> #include <linux/of_irq.h> #include <linux/of_mdio.h> #include <linux/of_net.h> #include <linux/phy.h> #include <linux/phy_fixed.h> #define DEFAULT_GPIO_RESET_DELAY 10 /* in microseconds */ MODULE_AUTHOR("Grant Likely <grant.likely@secretlab.ca>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("OpenFirmware MDIO bus (Ethernet PHY) accessors"); /* Extract the clause 22 phy ID from the compatible string of the form * ethernet-phy-idAAAA.BBBB */ static int of_get_phy_id(struct device_node *device, u32 *phy_id) { return fwnode_get_phy_id(of_fwnode_handle(device), phy_id); } int of_mdiobus_phy_device_register(struct mii_bus *mdio, struct phy_device *phy, struct device_node *child, u32 addr) { return fwnode_mdiobus_phy_device_register(mdio, phy, of_fwnode_handle(child), addr); } EXPORT_SYMBOL(of_mdiobus_phy_device_register); static int of_mdiobus_register_phy(struct mii_bus *mdio, struct device_node *child, u32 addr) { return fwnode_mdiobus_register_phy(mdio, of_fwnode_handle(child), addr); } static int of_mdiobus_register_device(struct mii_bus *mdio, struct device_node *child, u32 addr) { struct fwnode_handle *fwnode = of_fwnode_handle(child); struct mdio_device *mdiodev; int rc; mdiodev = mdio_device_create(mdio, addr); if (IS_ERR(mdiodev)) return PTR_ERR(mdiodev); /* Associate the OF node with the device structure so it * can be looked up later. */ fwnode_handle_get(fwnode); device_set_node(&mdiodev->dev, fwnode); /* All data is now stored in the mdiodev struct; register it. */ rc = mdio_device_register(mdiodev); if (rc) { device_set_node(&mdiodev->dev, NULL); fwnode_handle_put(fwnode); mdio_device_free(mdiodev); return rc; } dev_dbg(&mdio->dev, "registered mdio device %pOFn at address %i\n", child, addr); return 0; } /* The following is a list of PHY compatible strings which appear in * some DTBs. The compatible string is never matched against a PHY * driver, so is pointless. We only expect devices which are not PHYs * to have a compatible string, so they can be matched to an MDIO * driver. Encourage users to upgrade their DT blobs to remove these. */ static const struct of_device_id whitelist_phys[] = { { .compatible = "brcm,40nm-ephy" }, { .compatible = "broadcom,bcm5241" }, { .compatible = "marvell,88E1111", }, { .compatible = "marvell,88e1116", }, { .compatible = "marvell,88e1118", }, { .compatible = "marvell,88e1145", }, { .compatible = "marvell,88e1149r", }, { .compatible = "marvell,88e1310", }, { .compatible = "marvell,88E1510", }, { .compatible = "marvell,88E1514", }, { .compatible = "moxa,moxart-rtl8201cp", }, {} }; /* * Return true if the child node is for a phy. It must either: * o Compatible string of "ethernet-phy-idX.X" * o Compatible string of "ethernet-phy-ieee802.3-c45" * o Compatible string of "ethernet-phy-ieee802.3-c22" * o In the white list above (and issue a warning) * o No compatibility string * * A device which is not a phy is expected to have a compatible string * indicating what sort of device it is. */ bool of_mdiobus_child_is_phy(struct device_node *child) { u32 phy_id; if (of_get_phy_id(child, &phy_id) != -EINVAL) return true; if (of_device_is_compatible(child, "ethernet-phy-ieee802.3-c45")) return true; if (of_device_is_compatible(child, "ethernet-phy-ieee802.3-c22")) return true; if (of_match_node(whitelist_phys, child)) { pr_warn(FW_WARN "%pOF: Whitelisted compatible string. Please remove\n", child); return true; } if (!of_property_present(child, "compatible")) return true; return false; } EXPORT_SYMBOL(of_mdiobus_child_is_phy); static int __of_mdiobus_parse_phys(struct mii_bus *mdio, struct device_node *np, bool *scanphys) { struct device_node *child; int addr, rc = 0; /* Loop over the child nodes and register a phy_device for each phy */ for_each_available_child_of_node(np, child) { if (of_node_name_eq(child, "ethernet-phy-package")) { /* Ignore invalid ethernet-phy-package node */ if (!of_property_present(child, "reg")) continue; rc = __of_mdiobus_parse_phys(mdio, child, NULL); if (rc && rc != -ENODEV) goto exit; continue; } addr = of_mdio_parse_addr(&mdio->dev, child); if (addr < 0) { /* Skip scanning for invalid ethernet-phy-package node */ if (scanphys) *scanphys = true; continue; } if (of_mdiobus_child_is_phy(child)) rc = of_mdiobus_register_phy(mdio, child, addr); else rc = of_mdiobus_register_device(mdio, child, addr); if (rc == -ENODEV) dev_err(&mdio->dev, "MDIO device at address %d is missing.\n", addr); else if (rc) goto exit; } return 0; exit: of_node_put(child); return rc; } /** * __of_mdiobus_register - Register mii_bus and create PHYs from the device tree * @mdio: pointer to mii_bus structure * @np: pointer to device_node of MDIO bus. * @owner: module owning the @mdio object. * * This function registers the mii_bus structure and registers a phy_device * for each child node of @np. */ int __of_mdiobus_register(struct mii_bus *mdio, struct device_node *np, struct module *owner) { struct device_node *child; bool scanphys = false; int addr, rc; if (!np) return __mdiobus_register(mdio, owner); /* Do not continue if the node is disabled */ if (!of_device_is_available(np)) return -ENODEV; /* Mask out all PHYs from auto probing. Instead the PHYs listed in * the device tree are populated after the bus has been registered */ mdio->phy_mask = ~0; device_set_node(&mdio->dev, of_fwnode_handle(np)); /* Get bus level PHY reset GPIO details */ mdio->reset_delay_us = DEFAULT_GPIO_RESET_DELAY; of_property_read_u32(np, "reset-delay-us", &mdio->reset_delay_us); mdio->reset_post_delay_us = 0; of_property_read_u32(np, "reset-post-delay-us", &mdio->reset_post_delay_us); /* Register the MDIO bus */ rc = __mdiobus_register(mdio, owner); if (rc) return rc; /* Loop over the child nodes and register a phy_device for each phy */ rc = __of_mdiobus_parse_phys(mdio, np, &scanphys); if (rc) goto unregister; if (!scanphys) return 0; /* auto scan for PHYs with empty reg property */ for_each_available_child_of_node(np, child) { /* Skip PHYs with reg property set or ethernet-phy-package node */ if (of_property_present(child, "reg") || of_node_name_eq(child, "ethernet-phy-package")) continue; for (addr = 0; addr < PHY_MAX_ADDR; addr++) { /* skip already registered PHYs */ if (mdiobus_is_registered_device(mdio, addr)) continue; /* be noisy to encourage people to set reg property */ dev_info(&mdio->dev, "scan phy %pOFn at address %i\n", child, addr); if (of_mdiobus_child_is_phy(child)) { /* -ENODEV is the return code that PHYLIB has * standardized on to indicate that bus * scanning should continue. */ rc = of_mdiobus_register_phy(mdio, child, addr); if (!rc) break; if (rc != -ENODEV) goto put_unregister; } } } return 0; put_unregister: of_node_put(child); unregister: mdiobus_unregister(mdio); return rc; } EXPORT_SYMBOL(__of_mdiobus_register); /** * of_mdio_find_device - Given a device tree node, find the mdio_device * @np: pointer to the mdio_device's device tree node * * If successful, returns a pointer to the mdio_device with the embedded * struct device refcount incremented by one, or NULL on failure. * The caller should call put_device() on the mdio_device after its use */ struct mdio_device *of_mdio_find_device(struct device_node *np) { return fwnode_mdio_find_device(of_fwnode_handle(np)); } EXPORT_SYMBOL(of_mdio_find_device); /** * of_phy_find_device - Give a PHY node, find the phy_device * @phy_np: Pointer to the phy's device tree node * * If successful, returns a pointer to the phy_device with the embedded * struct device refcount incremented by one, or NULL on failure. */ struct phy_device *of_phy_find_device(struct device_node *phy_np) { return fwnode_phy_find_device(of_fwnode_handle(phy_np)); } EXPORT_SYMBOL(of_phy_find_device); /** * of_phy_connect - Connect to the phy described in the device tree * @dev: pointer to net_device claiming the phy * @phy_np: Pointer to device tree node for the PHY * @hndlr: Link state callback for the network device * @flags: flags to pass to the PHY * @iface: PHY data interface type * * If successful, returns a pointer to the phy_device with the embedded * struct device refcount incremented by one, or NULL on failure. The * refcount must be dropped by calling phy_disconnect() or phy_detach(). */ struct phy_device *of_phy_connect(struct net_device *dev, struct device_node *phy_np, void (*hndlr)(struct net_device *), u32 flags, phy_interface_t iface) { struct phy_device *phy = of_phy_find_device(phy_np); int ret; if (!phy) return NULL; phy->dev_flags |= flags; ret = phy_connect_direct(dev, phy, hndlr, iface); /* refcount is held by phy_connect_direct() on success */ put_device(&phy->mdio.dev); return ret ? NULL : phy; } EXPORT_SYMBOL(of_phy_connect); /** * of_phy_get_and_connect * - Get phy node and connect to the phy described in the device tree * @dev: pointer to net_device claiming the phy * @np: Pointer to device tree node for the net_device claiming the phy * @hndlr: Link state callback for the network device * * If successful, returns a pointer to the phy_device with the embedded * struct device refcount incremented by one, or NULL on failure. The * refcount must be dropped by calling phy_disconnect() or phy_detach(). */ struct phy_device *of_phy_get_and_connect(struct net_device *dev, struct device_node *np, void (*hndlr)(struct net_device *)) { phy_interface_t iface; struct device_node *phy_np; struct phy_device *phy; int ret; ret = of_get_phy_mode(np, &iface); if (ret) return NULL; if (of_phy_is_fixed_link(np)) { ret = of_phy_register_fixed_link(np); if (ret < 0) { netdev_err(dev, "broken fixed-link specification\n"); return NULL; } phy_np = of_node_get(np); } else { phy_np = of_parse_phandle(np, "phy-handle", 0); if (!phy_np) return NULL; } phy = of_phy_connect(dev, phy_np, hndlr, 0, iface); of_node_put(phy_np); return phy; } EXPORT_SYMBOL(of_phy_get_and_connect); /* * of_phy_is_fixed_link() and of_phy_register_fixed_link() must * support two DT bindings: * - the old DT binding, where 'fixed-link' was a property with 5 * cells encoding various information about the fixed PHY * - the new DT binding, where 'fixed-link' is a sub-node of the * Ethernet device. */ bool of_phy_is_fixed_link(struct device_node *np) { struct device_node *dn; int err; const char *managed; /* New binding */ dn = of_get_child_by_name(np, "fixed-link"); if (dn) { of_node_put(dn); return true; } err = of_property_read_string(np, "managed", &managed); if (err == 0 && strcmp(managed, "auto") != 0) return true; /* Old binding */ if (of_property_count_u32_elems(np, "fixed-link") == 5) return true; return false; } EXPORT_SYMBOL(of_phy_is_fixed_link); int of_phy_register_fixed_link(struct device_node *np) { struct fixed_phy_status status = {}; struct device_node *fixed_link_node; u32 fixed_link_prop[5]; const char *managed; if (of_property_read_string(np, "managed", &managed) == 0 && strcmp(managed, "in-band-status") == 0) { /* status is zeroed, namely its .link member */ goto register_phy; } /* New binding */ fixed_link_node = of_get_child_by_name(np, "fixed-link"); if (fixed_link_node) { status.link = 1; status.duplex = of_property_read_bool(fixed_link_node, "full-duplex"); if (of_property_read_u32(fixed_link_node, "speed", &status.speed)) { of_node_put(fixed_link_node); return -EINVAL; } status.pause = of_property_read_bool(fixed_link_node, "pause"); status.asym_pause = of_property_read_bool(fixed_link_node, "asym-pause"); of_node_put(fixed_link_node); goto register_phy; } /* Old binding */ if (of_property_read_u32_array(np, "fixed-link", fixed_link_prop, ARRAY_SIZE(fixed_link_prop)) == 0) { pr_warn_once("%pOF uses deprecated array-style fixed-link binding!\n", np); status.link = 1; status.duplex = fixed_link_prop[1]; status.speed = fixed_link_prop[2]; status.pause = fixed_link_prop[3]; status.asym_pause = fixed_link_prop[4]; goto register_phy; } return -ENODEV; register_phy: return PTR_ERR_OR_ZERO(fixed_phy_register(&status, np)); } EXPORT_SYMBOL(of_phy_register_fixed_link); void of_phy_deregister_fixed_link(struct device_node *np) { struct phy_device *phydev; phydev = of_phy_find_device(np); if (!phydev) return; fixed_phy_unregister(phydev); put_device(&phydev->mdio.dev); /* of_phy_find_device() */ } EXPORT_SYMBOL(of_phy_deregister_fixed_link); |
| 7 2 5 10 13 13 10 2 8 7 2 2 4 4 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 | // SPDX-License-Identifier: GPL-2.0-or-later /* * CTR: Counter mode * * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> */ #include <crypto/algapi.h> #include <crypto/ctr.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> struct crypto_rfc3686_ctx { struct crypto_skcipher *child; u8 nonce[CTR_RFC3686_NONCE_SIZE]; }; struct crypto_rfc3686_req_ctx { u8 iv[CTR_RFC3686_BLOCK_SIZE]; struct skcipher_request subreq CRYPTO_MINALIGN_ATTR; }; static void crypto_ctr_crypt_final(struct skcipher_walk *walk, struct crypto_cipher *tfm) { unsigned int bsize = crypto_cipher_blocksize(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); u8 *ctrblk = walk->iv; u8 tmp[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *keystream = PTR_ALIGN(tmp + 0, alignmask + 1); const u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; unsigned int nbytes = walk->nbytes; crypto_cipher_encrypt_one(tfm, keystream, ctrblk); crypto_xor_cpy(dst, keystream, src, nbytes); crypto_inc(ctrblk, bsize); } static int crypto_ctr_crypt_segment(struct skcipher_walk *walk, struct crypto_cipher *tfm) { void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = crypto_cipher_alg(tfm)->cia_encrypt; unsigned int bsize = crypto_cipher_blocksize(tfm); u8 *ctrblk = walk->iv; const u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; unsigned int nbytes = walk->nbytes; do { /* create keystream */ fn(crypto_cipher_tfm(tfm), dst, ctrblk); crypto_xor(dst, src, bsize); /* increment counter in counterblock */ crypto_inc(ctrblk, bsize); src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_ctr_crypt_inplace(struct skcipher_walk *walk, struct crypto_cipher *tfm) { void (*fn)(struct crypto_tfm *, u8 *, const u8 *) = crypto_cipher_alg(tfm)->cia_encrypt; unsigned int bsize = crypto_cipher_blocksize(tfm); unsigned long alignmask = crypto_cipher_alignmask(tfm); unsigned int nbytes = walk->nbytes; u8 *dst = walk->dst.virt.addr; u8 *ctrblk = walk->iv; u8 tmp[MAX_CIPHER_BLOCKSIZE + MAX_CIPHER_ALIGNMASK]; u8 *keystream = PTR_ALIGN(tmp + 0, alignmask + 1); do { /* create keystream */ fn(crypto_cipher_tfm(tfm), keystream, ctrblk); crypto_xor(dst, keystream, bsize); /* increment counter in counterblock */ crypto_inc(ctrblk, bsize); dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_ctr_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); const unsigned int bsize = crypto_cipher_blocksize(cipher); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes >= bsize) { if (walk.src.virt.addr == walk.dst.virt.addr) nbytes = crypto_ctr_crypt_inplace(&walk, cipher); else nbytes = crypto_ctr_crypt_segment(&walk, cipher); err = skcipher_walk_done(&walk, nbytes); } if (walk.nbytes) { crypto_ctr_crypt_final(&walk, cipher); err = skcipher_walk_done(&walk, 0); } return err; } static int crypto_ctr_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_instance *inst; struct crypto_alg *alg; int err; inst = skcipher_alloc_instance_simple(tmpl, tb); if (IS_ERR(inst)) return PTR_ERR(inst); alg = skcipher_ialg_simple(inst); /* Block size must be >= 4 bytes. */ err = -EINVAL; if (alg->cra_blocksize < 4) goto out_free_inst; /* If this is false we'd fail the alignment of crypto_inc. */ if (alg->cra_blocksize % 4) goto out_free_inst; /* CTR mode is a stream cipher. */ inst->alg.base.cra_blocksize = 1; /* * To simplify the implementation, configure the skcipher walk to only * give a partial block at the very end, never earlier. */ inst->alg.chunksize = alg->cra_blocksize; inst->alg.encrypt = crypto_ctr_crypt; inst->alg.decrypt = crypto_ctr_crypt; err = skcipher_register_instance(tmpl, inst); if (err) { out_free_inst: inst->free(inst); } return err; } static int crypto_rfc3686_setkey(struct crypto_skcipher *parent, const u8 *key, unsigned int keylen) { struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(parent); struct crypto_skcipher *child = ctx->child; /* the nonce is stored in bytes at end of key */ if (keylen < CTR_RFC3686_NONCE_SIZE) return -EINVAL; memcpy(ctx->nonce, key + (keylen - CTR_RFC3686_NONCE_SIZE), CTR_RFC3686_NONCE_SIZE); keylen -= CTR_RFC3686_NONCE_SIZE; crypto_skcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(child, crypto_skcipher_get_flags(parent) & CRYPTO_TFM_REQ_MASK); return crypto_skcipher_setkey(child, key, keylen); } static int crypto_rfc3686_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *child = ctx->child; unsigned long align = crypto_skcipher_alignmask(tfm); struct crypto_rfc3686_req_ctx *rctx = (void *)PTR_ALIGN((u8 *)skcipher_request_ctx(req), align + 1); struct skcipher_request *subreq = &rctx->subreq; u8 *iv = rctx->iv; /* set up counter block */ memcpy(iv, ctx->nonce, CTR_RFC3686_NONCE_SIZE); memcpy(iv + CTR_RFC3686_NONCE_SIZE, req->iv, CTR_RFC3686_IV_SIZE); /* initialize counter portion of counter block */ *(__be32 *)(iv + CTR_RFC3686_NONCE_SIZE + CTR_RFC3686_IV_SIZE) = cpu_to_be32(1); skcipher_request_set_tfm(subreq, child); skcipher_request_set_callback(subreq, req->base.flags, req->base.complete, req->base.data); skcipher_request_set_crypt(subreq, req->src, req->dst, req->cryptlen, iv); return crypto_skcipher_encrypt(subreq); } static int crypto_rfc3686_init_tfm(struct crypto_skcipher *tfm) { struct skcipher_instance *inst = skcipher_alg_instance(tfm); struct crypto_skcipher_spawn *spawn = skcipher_instance_ctx(inst); struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(tfm); struct crypto_skcipher *cipher; unsigned long align; unsigned int reqsize; cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; align = crypto_skcipher_alignmask(tfm); align &= ~(crypto_tfm_ctx_alignment() - 1); reqsize = align + sizeof(struct crypto_rfc3686_req_ctx) + crypto_skcipher_reqsize(cipher); crypto_skcipher_set_reqsize(tfm, reqsize); return 0; } static void crypto_rfc3686_exit_tfm(struct crypto_skcipher *tfm) { struct crypto_rfc3686_ctx *ctx = crypto_skcipher_ctx(tfm); crypto_free_skcipher(ctx->child); } static void crypto_rfc3686_free(struct skcipher_instance *inst) { struct crypto_skcipher_spawn *spawn = skcipher_instance_ctx(inst); crypto_drop_skcipher(spawn); kfree(inst); } static int crypto_rfc3686_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_instance *inst; struct crypto_skcipher_spawn *spawn; struct skcipher_alg_common *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SKCIPHER, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = skcipher_instance_ctx(inst); err = crypto_grab_skcipher(spawn, skcipher_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_skcipher_alg_common(spawn); /* We only support 16-byte blocks. */ err = -EINVAL; if (alg->ivsize != CTR_RFC3686_BLOCK_SIZE) goto err_free_inst; /* Not a stream cipher? */ if (alg->base.cra_blocksize != 1) goto err_free_inst; err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "rfc3686(%s)", alg->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "rfc3686(%s)", alg->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = alg->base.cra_priority; inst->alg.base.cra_blocksize = 1; inst->alg.base.cra_alignmask = alg->base.cra_alignmask; inst->alg.ivsize = CTR_RFC3686_IV_SIZE; inst->alg.chunksize = alg->chunksize; inst->alg.min_keysize = alg->min_keysize + CTR_RFC3686_NONCE_SIZE; inst->alg.max_keysize = alg->max_keysize + CTR_RFC3686_NONCE_SIZE; inst->alg.setkey = crypto_rfc3686_setkey; inst->alg.encrypt = crypto_rfc3686_crypt; inst->alg.decrypt = crypto_rfc3686_crypt; inst->alg.base.cra_ctxsize = sizeof(struct crypto_rfc3686_ctx); inst->alg.init = crypto_rfc3686_init_tfm; inst->alg.exit = crypto_rfc3686_exit_tfm; inst->free = crypto_rfc3686_free; err = skcipher_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_rfc3686_free(inst); } return err; } static struct crypto_template crypto_ctr_tmpls[] = { { .name = "ctr", .create = crypto_ctr_create, .module = THIS_MODULE, }, { .name = "rfc3686", .create = crypto_rfc3686_create, .module = THIS_MODULE, }, }; static int __init crypto_ctr_module_init(void) { return crypto_register_templates(crypto_ctr_tmpls, ARRAY_SIZE(crypto_ctr_tmpls)); } static void __exit crypto_ctr_module_exit(void) { crypto_unregister_templates(crypto_ctr_tmpls, ARRAY_SIZE(crypto_ctr_tmpls)); } module_init(crypto_ctr_module_init); module_exit(crypto_ctr_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CTR block cipher mode of operation"); MODULE_ALIAS_CRYPTO("rfc3686"); MODULE_ALIAS_CRYPTO("ctr"); MODULE_IMPORT_NS("CRYPTO_INTERNAL"); |
| 235 183 117 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer MIDI-through client * Copyright (c) 1999-2000 by Takashi Iwai <tiwai@suse.de> */ #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <sound/core.h> #include "seq_clientmgr.h" #include <sound/initval.h> #include <sound/asoundef.h> /* Sequencer MIDI-through client This gives a simple midi-through client. All the normal input events are redirected to output port immediately. The routing can be done via aconnect program in alsa-utils. Each client has a static client number 14 (= SNDRV_SEQ_CLIENT_DUMMY). If you want to auto-load this module, you may add the following alias in your /etc/conf.modules file. alias snd-seq-client-14 snd-seq-dummy The module is loaded on demand for client 14, or /proc/asound/seq/ is accessed. If you don't need this module to be loaded, alias snd-seq-client-14 as "off". This will help modprobe. The number of ports to be created can be specified via the module parameter "ports". For example, to create four ports, add the following option in a configuration file under /etc/modprobe.d/: option snd-seq-dummy ports=4 The model option "duplex=1" enables duplex operation to the port. In duplex mode, a pair of ports are created instead of single port, and events are tunneled between pair-ports. For example, input to port A is sent to output port of another port B and vice versa. In duplex mode, each port has DUPLEX capability. */ MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>"); MODULE_DESCRIPTION("ALSA sequencer MIDI-through client"); MODULE_LICENSE("GPL"); MODULE_ALIAS("snd-seq-client-" __stringify(SNDRV_SEQ_CLIENT_DUMMY)); static int ports = 1; static bool duplex; module_param(ports, int, 0444); MODULE_PARM_DESC(ports, "number of ports to be created"); module_param(duplex, bool, 0444); MODULE_PARM_DESC(duplex, "create DUPLEX ports"); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) static int ump; module_param(ump, int, 0444); MODULE_PARM_DESC(ump, "UMP conversion (0: no convert, 1: MIDI 1.0, 2: MIDI 2.0)"); #endif struct snd_seq_dummy_port { int client; int port; int duplex; int connect; }; static int my_client = -1; /* * event input callback - just redirect events to subscribers */ static int dummy_input(struct snd_seq_event *ev, int direct, void *private_data, int atomic, int hop) { struct snd_seq_dummy_port *p; struct snd_seq_event tmpev; p = private_data; if (ev->source.client == SNDRV_SEQ_CLIENT_SYSTEM || ev->type == SNDRV_SEQ_EVENT_KERNEL_ERROR) return 0; /* ignore system messages */ tmpev = *ev; if (p->duplex) tmpev.source.port = p->connect; else tmpev.source.port = p->port; tmpev.dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; return snd_seq_kernel_client_dispatch(p->client, &tmpev, atomic, hop); } /* * free_private callback */ static void dummy_free(void *private_data) { kfree(private_data); } /* * create a port */ static struct snd_seq_dummy_port __init * create_port(int idx, int type) { struct snd_seq_port_info pinfo; struct snd_seq_port_callback pcb; struct snd_seq_dummy_port *rec; rec = kzalloc(sizeof(*rec), GFP_KERNEL); if (!rec) return NULL; rec->client = my_client; rec->duplex = duplex; rec->connect = 0; memset(&pinfo, 0, sizeof(pinfo)); pinfo.addr.client = my_client; if (duplex) sprintf(pinfo.name, "Midi Through Port-%d:%c", idx, (type ? 'B' : 'A')); else sprintf(pinfo.name, "Midi Through Port-%d", idx); pinfo.capability = SNDRV_SEQ_PORT_CAP_READ | SNDRV_SEQ_PORT_CAP_SUBS_READ; pinfo.capability |= SNDRV_SEQ_PORT_CAP_WRITE | SNDRV_SEQ_PORT_CAP_SUBS_WRITE; if (duplex) pinfo.capability |= SNDRV_SEQ_PORT_CAP_DUPLEX; pinfo.direction = SNDRV_SEQ_PORT_DIR_BIDIRECTION; pinfo.type = SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | SNDRV_SEQ_PORT_TYPE_SOFTWARE | SNDRV_SEQ_PORT_TYPE_PORT; memset(&pcb, 0, sizeof(pcb)); pcb.owner = THIS_MODULE; pcb.event_input = dummy_input; pcb.private_free = dummy_free; pcb.private_data = rec; pinfo.kernel = &pcb; if (snd_seq_kernel_client_ctl(my_client, SNDRV_SEQ_IOCTL_CREATE_PORT, &pinfo) < 0) { kfree(rec); return NULL; } rec->port = pinfo.addr.port; return rec; } /* * register client and create ports */ static int __init register_client(void) { struct snd_seq_dummy_port *rec1, *rec2; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) struct snd_seq_client *client; #endif int i; if (ports < 1) { pr_err("ALSA: seq_dummy: invalid number of ports %d\n", ports); return -EINVAL; } /* create client */ my_client = snd_seq_create_kernel_client(NULL, SNDRV_SEQ_CLIENT_DUMMY, "Midi Through"); if (my_client < 0) return my_client; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) client = snd_seq_kernel_client_get(my_client); if (!client) return -EINVAL; switch (ump) { case 1: client->midi_version = SNDRV_SEQ_CLIENT_UMP_MIDI_1_0; break; case 2: client->midi_version = SNDRV_SEQ_CLIENT_UMP_MIDI_2_0; break; default: /* don't convert events but just pass-through */ client->filter = SNDRV_SEQ_FILTER_NO_CONVERT; break; } snd_seq_kernel_client_put(client); #endif /* create ports */ for (i = 0; i < ports; i++) { rec1 = create_port(i, 0); if (rec1 == NULL) { snd_seq_delete_kernel_client(my_client); return -ENOMEM; } if (duplex) { rec2 = create_port(i, 1); if (rec2 == NULL) { snd_seq_delete_kernel_client(my_client); return -ENOMEM; } rec1->connect = rec2->port; rec2->connect = rec1->port; } } return 0; } /* * delete client if exists */ static void __exit delete_client(void) { if (my_client >= 0) snd_seq_delete_kernel_client(my_client); } /* * Init part */ static int __init alsa_seq_dummy_init(void) { return register_client(); } static void __exit alsa_seq_dummy_exit(void) { delete_client(); } module_init(alsa_seq_dummy_init) module_exit(alsa_seq_dummy_exit) |
| 1 1 14 2 8 5 2 5 1 4 4 4 4 1 4 1 5 6 6 7 6 1 1 1 8 2 2 483 484 470 28 9 4 6 5 5 3 1 2 581 408 206 123 49 48 2620 2619 581 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <net/flow_offload.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/pkt_cls.h> static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions) { struct nft_flow_rule *flow; flow = kzalloc(sizeof(struct nft_flow_rule), GFP_KERNEL); if (!flow) return NULL; flow->rule = flow_rule_alloc(num_actions); if (!flow->rule) { kfree(flow); return NULL; } flow->rule->match.dissector = &flow->match.dissector; flow->rule->match.mask = &flow->match.mask; flow->rule->match.key = &flow->match.key; return flow; } void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow, enum flow_dissector_key_id addr_type) { struct nft_flow_match *match = &flow->match; struct nft_flow_key *mask = &match->mask; struct nft_flow_key *key = &match->key; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL)) return; key->control.addr_type = addr_type; mask->control.addr_type = 0xffff; match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL); match->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] = offsetof(struct nft_flow_key, control); } struct nft_offload_ethertype { __be16 value; __be16 mask; }; static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow) { struct nft_flow_match *match = &flow->match; struct nft_offload_ethertype ethertype = { .value = match->key.basic.n_proto, .mask = match->mask.basic.n_proto, }; if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) && (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) || match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.cvlan.vlan_tpid; match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid; match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid; match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] = offsetof(struct nft_flow_key, cvlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN); } else if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) && (match->key.basic.n_proto == htons(ETH_P_8021Q) || match->key.basic.n_proto == htons(ETH_P_8021AD))) { match->key.basic.n_proto = match->key.vlan.vlan_tpid; match->mask.basic.n_proto = match->mask.vlan.vlan_tpid; match->key.vlan.vlan_tpid = ethertype.value; match->mask.vlan.vlan_tpid = ethertype.mask; match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] = offsetof(struct nft_flow_key, vlan); match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN); } } struct nft_flow_rule *nft_flow_rule_create(struct net *net, const struct nft_rule *rule) { struct nft_offload_ctx *ctx; struct nft_flow_rule *flow; int num_actions = 0, err; struct nft_expr *expr; expr = nft_expr_first(rule); while (nft_expr_more(rule, expr)) { if (expr->ops->offload_action && expr->ops->offload_action(expr)) num_actions++; expr = nft_expr_next(expr); } if (num_actions == 0) return ERR_PTR(-EOPNOTSUPP); flow = nft_flow_rule_alloc(num_actions); if (!flow) return ERR_PTR(-ENOMEM); expr = nft_expr_first(rule); ctx = kzalloc(sizeof(struct nft_offload_ctx), GFP_KERNEL); if (!ctx) { err = -ENOMEM; goto err_out; } ctx->net = net; ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; while (nft_expr_more(rule, expr)) { if (!expr->ops->offload) { err = -EOPNOTSUPP; goto err_out; } err = expr->ops->offload(ctx, flow, expr); if (err < 0) goto err_out; expr = nft_expr_next(expr); } nft_flow_rule_transfer_vlan(ctx, flow); flow->proto = ctx->dep.l3num; kfree(ctx); return flow; err_out: kfree(ctx); nft_flow_rule_destroy(flow); return ERR_PTR(err); } void nft_flow_rule_destroy(struct nft_flow_rule *flow) { struct flow_action_entry *entry; int i; flow_action_for_each(i, entry, &flow->rule->action) { switch (entry->id) { case FLOW_ACTION_REDIRECT: case FLOW_ACTION_MIRRED: dev_put(entry->dev); break; default: break; } } kfree(flow->rule); kfree(flow); } void nft_offload_set_dependency(struct nft_offload_ctx *ctx, enum nft_offload_dep_type type) { ctx->dep.type = type; } void nft_offload_update_dependency(struct nft_offload_ctx *ctx, const void *data, u32 len) { switch (ctx->dep.type) { case NFT_OFFLOAD_DEP_NETWORK: WARN_ON(len != sizeof(__u16)); memcpy(&ctx->dep.l3num, data, sizeof(__u16)); break; case NFT_OFFLOAD_DEP_TRANSPORT: WARN_ON(len != sizeof(__u8)); memcpy(&ctx->dep.protonum, data, sizeof(__u8)); break; default: break; } ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC; } static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, __be16 proto, int priority, struct netlink_ext_ack *extack) { common->protocol = proto; common->prio = priority; common->extack = extack; } static int nft_setup_cb_call(enum tc_setup_type type, void *type_data, struct list_head *cb_list) { struct flow_block_cb *block_cb; int err; list_for_each_entry(block_cb, cb_list, list) { err = block_cb->cb(type, type_data, block_cb->cb_priv); if (err < 0) return err; } return 0; } static int nft_chain_offload_priority(const struct nft_base_chain *basechain) { if (basechain->ops.priority <= 0 || basechain->ops.priority > USHRT_MAX) return -1; return 0; } bool nft_chain_offload_support(const struct nft_base_chain *basechain) { struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; if (nft_chain_offload_priority(basechain) < 0) return false; list_for_each_entry(hook, &basechain->hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) { if (ops->pf != NFPROTO_NETDEV || ops->hooknum != NF_NETDEV_INGRESS) return false; dev = ops->dev; if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) return false; } } return true; } static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow, const struct nft_base_chain *basechain, const struct nft_rule *rule, const struct nft_flow_rule *flow, struct netlink_ext_ack *extack, enum flow_cls_command command) { __be16 proto = ETH_P_ALL; memset(cls_flow, 0, sizeof(*cls_flow)); if (flow) proto = flow->proto; nft_flow_offload_common_init(&cls_flow->common, proto, basechain->ops.priority, extack); cls_flow->command = command; cls_flow->cookie = (unsigned long) rule; if (flow) cls_flow->rule = flow->rule; } static int nft_flow_offload_cmd(const struct nft_chain *chain, const struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command, struct flow_cls_offload *cls_flow) { struct netlink_ext_ack extack = {}; struct nft_base_chain *basechain; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); nft_flow_cls_offload_setup(cls_flow, basechain, rule, flow, &extack, command); return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow, &basechain->flow_block.cb_list); } static int nft_flow_offload_rule(const struct nft_chain *chain, struct nft_rule *rule, struct nft_flow_rule *flow, enum flow_cls_command command) { struct flow_cls_offload cls_flow; return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow); } int nft_flow_rule_stats(const struct nft_chain *chain, const struct nft_rule *rule) { struct flow_cls_offload cls_flow = {}; struct nft_expr *expr, *next; int err; err = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS, &cls_flow); if (err < 0) return err; nft_rule_for_each_expr(expr, next, rule) { if (expr->ops->offload_stats) expr->ops->offload_stats(expr, &cls_flow.stats); } return 0; } static int nft_flow_offload_bind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { list_splice(&bo->cb_list, &basechain->flow_block.cb_list); return 0; } static int nft_flow_offload_unbind(struct flow_block_offload *bo, struct nft_base_chain *basechain) { struct flow_block_cb *block_cb, *next; struct flow_cls_offload cls_flow; struct netlink_ext_ack extack; struct nft_chain *chain; struct nft_rule *rule; chain = &basechain->chain; list_for_each_entry(rule, &chain->rules, list) { memset(&extack, 0, sizeof(extack)); nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL, &extack, FLOW_CLS_DESTROY); nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list); } list_for_each_entry_safe(block_cb, next, &bo->cb_list, list) { list_del(&block_cb->list); flow_block_cb_free(block_cb); } return 0; } static int nft_block_setup(struct nft_base_chain *basechain, struct flow_block_offload *bo, enum flow_block_command cmd) { int err; switch (cmd) { case FLOW_BLOCK_BIND: err = nft_flow_offload_bind(bo, basechain); break; case FLOW_BLOCK_UNBIND: err = nft_flow_offload_unbind(bo, basechain); break; default: WARN_ON_ONCE(1); err = -EOPNOTSUPP; } return err; } static void nft_flow_block_offload_init(struct flow_block_offload *bo, struct net *net, enum flow_block_command cmd, struct nft_base_chain *basechain, struct netlink_ext_ack *extack) { memset(bo, 0, sizeof(*bo)); bo->net = net; bo->block = &basechain->flow_block; bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } static int nft_block_offload_cmd(struct nft_base_chain *chain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); if (err < 0) return err; return nft_block_setup(chain, &bo, cmd); } static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct flow_block_offload bo; nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, basechain, &extack); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); list_del(&block_cb->driver_list); list_move(&block_cb->list, &bo.cb_list); nft_flow_offload_unbind(&bo, basechain); mutex_unlock(&nft_net->commit_mutex); } static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; int err; nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); err = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain, &bo, nft_indr_block_cleanup); if (err < 0) return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; return nft_block_setup(basechain, &bo, cmd); } static int nft_chain_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { int err; if (dev->netdev_ops->ndo_setup_tc) err = nft_block_offload_cmd(basechain, dev, cmd); else err = nft_indr_block_offload_cmd(basechain, dev, cmd); return err; } static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { struct nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) { if (this_dev && this_dev != ops->dev) continue; err = nft_chain_offload_cmd(basechain, ops->dev, cmd); if (err < 0 && cmd == FLOW_BLOCK_BIND) { if (!this_dev) goto err_flow_block; return err; } i++; } } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { list_for_each_entry(ops, &hook->ops_list, list) { if (i-- <= 0) break; nft_chain_offload_cmd(basechain, ops->dev, FLOW_BLOCK_UNBIND); } } return err; } static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy, enum flow_block_command cmd) { struct nft_base_chain *basechain; u8 policy; if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; basechain = nft_base_chain(chain); policy = ppolicy ? *ppolicy : basechain->policy; /* Only default policy to accept is supported for now. */ if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP) return -EOPNOTSUPP; return nft_flow_block_chain(basechain, NULL, cmd); } static void nft_flow_rule_offload_abort(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); int err = 0; list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) { if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_UNBIND); break; case NFT_MSG_DELCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_chain(nft_trans_chain(trans), NULL, FLOW_BLOCK_BIND); break; case NFT_MSG_NEWRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; case NFT_MSG_DELRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; } if (WARN_ON_ONCE(err)) break; } } int nft_flow_rule_offload_commit(struct net *net) { struct nftables_pernet *nft_net = nft_pernet(net); struct nft_trans *trans; int err = 0; u8 policy; list_for_each_entry(trans, &nft_net->commit_list, list) { if (trans->table->family != NFPROTO_NETDEV) continue; switch (trans->msg_type) { case NFT_MSG_NEWCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) || nft_trans_chain_update(trans)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_BIND); break; case NFT_MSG_DELCHAIN: if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; policy = nft_trans_chain_policy(trans); err = nft_flow_offload_chain(nft_trans_chain(trans), &policy, FLOW_BLOCK_UNBIND); break; case NFT_MSG_NEWRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; if (trans->flags & NLM_F_REPLACE || !(trans->flags & NLM_F_APPEND)) { err = -EOPNOTSUPP; break; } err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), nft_trans_flow_rule(trans), FLOW_CLS_REPLACE); break; case NFT_MSG_DELRULE: if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)) continue; err = nft_flow_offload_rule(nft_trans_rule_chain(trans), nft_trans_rule(trans), NULL, FLOW_CLS_DESTROY); break; } if (err) { nft_flow_rule_offload_abort(net, trans); break; } } return err; } static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net, struct net_device *dev) { struct nft_base_chain *basechain; struct nft_hook *hook, *found; const struct nft_table *table; struct nft_chain *chain; list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV) continue; list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain) || !(chain->flags & NFT_CHAIN_HW_OFFLOAD)) continue; found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { if (!nft_hook_find_ops(hook, dev)) continue; found = hook; break; } if (!found) continue; return chain; } } return NULL; } static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nftables_pernet *nft_net; struct net *net = dev_net(dev); struct nft_chain *chain; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); chain = __nft_offload_get_chain(nft_net, dev); if (chain) nft_flow_block_chain(nft_base_chain(chain), dev, FLOW_BLOCK_UNBIND); mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { unregister_netdevice_notifier(&nft_offload_netdev_notifier); } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 | /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/ipv6.h> #include <net/dsfield.h> #include <net/xfrm.h> #ifndef XFRM_INOUT_H #define XFRM_INOUT_H 1 static inline void xfrm4_extract_header(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); XFRM_MODE_SKB_CB(skb)->id = iph->id; XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; XFRM_MODE_SKB_CB(skb)->tos = iph->tos; XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); } static inline void xfrm6_extract_header(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *iph = ipv6_hdr(skb); XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); XFRM_MODE_SKB_CB(skb)->id = 0; XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; XFRM_MODE_SKB_CB(skb)->optlen = 0; memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); #else WARN_ON_ONCE(1); #endif } static inline void xfrm6_beet_make_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); iph->version = 6; memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl, sizeof(iph->flow_lbl)); iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol; ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos); iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl; } static inline void xfrm4_beet_make_header(struct sk_buff *skb) { struct iphdr *iph = ip_hdr(skb); iph->ihl = 5; iph->version = 4; iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; iph->tos = XFRM_MODE_SKB_CB(skb)->tos; iph->id = XFRM_MODE_SKB_CB(skb)->id; iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off; iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl; } #endif |
| 7 6 6 1 1 8 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Crypto API wrappers for the ChaCha20, XChaCha20, and XChaCha12 stream ciphers * * Copyright (C) 2015 Martin Willi * Copyright (C) 2018 Google LLC */ #include <linux/unaligned.h> #include <crypto/algapi.h> #include <crypto/chacha.h> #include <crypto/internal/skcipher.h> #include <linux/module.h> struct chacha_ctx { u32 key[8]; int nrounds; }; static int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize, int nrounds) { struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); int i; if (keysize != CHACHA_KEY_SIZE) return -EINVAL; for (i = 0; i < ARRAY_SIZE(ctx->key); i++) ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32)); ctx->nrounds = nrounds; return 0; } static int chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize) { return chacha_setkey(tfm, key, keysize, 20); } static int chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize) { return chacha_setkey(tfm, key, keysize, 12); } static int chacha_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 iv[CHACHA_IV_SIZE]) { struct skcipher_walk walk; struct chacha_state state; int err; err = skcipher_walk_virt(&walk, req, false); chacha_init(&state, ctx->key, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE); chacha_crypt(&state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } static int crypto_chacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); return chacha_stream_xor(req, ctx, req->iv); } static int crypto_xchacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); const struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); struct chacha_ctx subctx; struct chacha_state state; u8 real_iv[16]; /* Compute the subkey given the original key and first 128 nonce bits */ chacha_init(&state, ctx->key, req->iv); hchacha_block(&state, subctx.key, ctx->nrounds); subctx.nrounds = ctx->nrounds; /* Build the real IV */ memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */ memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ /* Generate the stream and XOR it with the data */ return chacha_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { { .base.cra_name = "chacha20", .base.cra_driver_name = "chacha20-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_chacha_crypt, .decrypt = crypto_chacha_crypt, }, { .base.cra_name = "xchacha20", .base.cra_driver_name = "xchacha20-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_xchacha_crypt, .decrypt = crypto_xchacha_crypt, }, { .base.cra_name = "xchacha12", .base.cra_driver_name = "xchacha12-lib", .base.cra_priority = 300, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha12_setkey, .encrypt = crypto_xchacha_crypt, .decrypt = crypto_xchacha_crypt, } }; static int __init crypto_chacha_mod_init(void) { return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit crypto_chacha_mod_fini(void) { crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } module_init(crypto_chacha_mod_init); module_exit(crypto_chacha_mod_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); MODULE_DESCRIPTION("Crypto API wrappers for the ChaCha20, XChaCha20, and XChaCha12 stream ciphers"); MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-lib"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-lib"); MODULE_ALIAS_CRYPTO("xchacha12"); MODULE_ALIAS_CRYPTO("xchacha12-lib"); |
| 5 5 1 1 1 2 571 571 571 68 69 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/bits.h> #include <linux/bitfield.h> #include <linux/idr.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/xarray.h> #include <net/devlink.h> #include <net/net_shaper.h> #include "shaper_nl_gen.h" #include "../core/dev.h" #define NET_SHAPER_SCOPE_SHIFT 26 #define NET_SHAPER_ID_MASK GENMASK(NET_SHAPER_SCOPE_SHIFT - 1, 0) #define NET_SHAPER_SCOPE_MASK GENMASK(31, NET_SHAPER_SCOPE_SHIFT) #define NET_SHAPER_ID_UNSPEC NET_SHAPER_ID_MASK struct net_shaper_hierarchy { struct xarray shapers; }; struct net_shaper_nl_ctx { struct net_shaper_binding binding; netdevice_tracker dev_tracker; unsigned long start_index; }; static struct net_shaper_binding *net_shaper_binding_from_ctx(void *ctx) { return &((struct net_shaper_nl_ctx *)ctx)->binding; } static void net_shaper_lock(struct net_shaper_binding *binding) { switch (binding->type) { case NET_SHAPER_BINDING_TYPE_NETDEV: netdev_lock(binding->netdev); break; } } static void net_shaper_unlock(struct net_shaper_binding *binding) { switch (binding->type) { case NET_SHAPER_BINDING_TYPE_NETDEV: netdev_unlock(binding->netdev); break; } } static struct net_shaper_hierarchy * net_shaper_hierarchy(struct net_shaper_binding *binding) { /* Pairs with WRITE_ONCE() in net_shaper_hierarchy_setup. */ if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) return READ_ONCE(binding->netdev->net_shaper_hierarchy); /* No other type supported yet. */ return NULL; } static const struct net_shaper_ops * net_shaper_ops(struct net_shaper_binding *binding) { if (binding->type == NET_SHAPER_BINDING_TYPE_NETDEV) return binding->netdev->netdev_ops->net_shaper_ops; /* No other type supported yet. */ return NULL; } /* Count the number of [multi] attributes of the given type. */ static int net_shaper_list_len(struct genl_info *info, int type) { struct nlattr *attr; int rem, cnt = 0; nla_for_each_attr_type(attr, type, genlmsg_data(info->genlhdr), genlmsg_len(info->genlhdr), rem) cnt++; return cnt; } static int net_shaper_handle_size(void) { return nla_total_size(nla_total_size(sizeof(u32)) + nla_total_size(sizeof(u32))); } static int net_shaper_fill_binding(struct sk_buff *msg, const struct net_shaper_binding *binding, u32 type) { /* Should never happen, as currently only NETDEV is supported. */ if (WARN_ON_ONCE(binding->type != NET_SHAPER_BINDING_TYPE_NETDEV)) return -EINVAL; if (nla_put_u32(msg, type, binding->netdev->ifindex)) return -EMSGSIZE; return 0; } static int net_shaper_fill_handle(struct sk_buff *msg, const struct net_shaper_handle *handle, u32 type) { struct nlattr *handle_attr; if (handle->scope == NET_SHAPER_SCOPE_UNSPEC) return 0; handle_attr = nla_nest_start(msg, type); if (!handle_attr) return -EMSGSIZE; if (nla_put_u32(msg, NET_SHAPER_A_HANDLE_SCOPE, handle->scope) || (handle->scope >= NET_SHAPER_SCOPE_QUEUE && nla_put_u32(msg, NET_SHAPER_A_HANDLE_ID, handle->id))) goto handle_nest_cancel; nla_nest_end(msg, handle_attr); return 0; handle_nest_cancel: nla_nest_cancel(msg, handle_attr); return -EMSGSIZE; } static int net_shaper_fill_one(struct sk_buff *msg, const struct net_shaper_binding *binding, const struct net_shaper *shaper, const struct genl_info *info) { void *hdr; hdr = genlmsg_iput(msg, info); if (!hdr) return -EMSGSIZE; if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || net_shaper_fill_handle(msg, &shaper->parent, NET_SHAPER_A_PARENT) || net_shaper_fill_handle(msg, &shaper->handle, NET_SHAPER_A_HANDLE) || ((shaper->bw_min || shaper->bw_max || shaper->burst) && nla_put_u32(msg, NET_SHAPER_A_METRIC, shaper->metric)) || (shaper->bw_min && nla_put_uint(msg, NET_SHAPER_A_BW_MIN, shaper->bw_min)) || (shaper->bw_max && nla_put_uint(msg, NET_SHAPER_A_BW_MAX, shaper->bw_max)) || (shaper->burst && nla_put_uint(msg, NET_SHAPER_A_BURST, shaper->burst)) || (shaper->priority && nla_put_u32(msg, NET_SHAPER_A_PRIORITY, shaper->priority)) || (shaper->weight && nla_put_u32(msg, NET_SHAPER_A_WEIGHT, shaper->weight))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /* Initialize the context fetching the relevant device and * acquiring a reference to it. */ static int net_shaper_ctx_setup(const struct genl_info *info, int type, struct net_shaper_nl_ctx *ctx) { struct net *ns = genl_info_net(info); struct net_device *dev; int ifindex; if (GENL_REQ_ATTR_CHECK(info, type)) return -EINVAL; ifindex = nla_get_u32(info->attrs[type]); dev = netdev_get_by_index(ns, ifindex, &ctx->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_BAD_ATTR(info->extack, info->attrs[type]); return -ENOENT; } if (!dev->netdev_ops->net_shaper_ops) { NL_SET_BAD_ATTR(info->extack, info->attrs[type]); netdev_put(dev, &ctx->dev_tracker); return -EOPNOTSUPP; } ctx->binding.type = NET_SHAPER_BINDING_TYPE_NETDEV; ctx->binding.netdev = dev; return 0; } static void net_shaper_ctx_cleanup(struct net_shaper_nl_ctx *ctx) { if (ctx->binding.type == NET_SHAPER_BINDING_TYPE_NETDEV) netdev_put(ctx->binding.netdev, &ctx->dev_tracker); } static u32 net_shaper_handle_to_index(const struct net_shaper_handle *handle) { return FIELD_PREP(NET_SHAPER_SCOPE_MASK, handle->scope) | FIELD_PREP(NET_SHAPER_ID_MASK, handle->id); } static void net_shaper_index_to_handle(u32 index, struct net_shaper_handle *handle) { handle->scope = FIELD_GET(NET_SHAPER_SCOPE_MASK, index); handle->id = FIELD_GET(NET_SHAPER_ID_MASK, index); } static void net_shaper_default_parent(const struct net_shaper_handle *handle, struct net_shaper_handle *parent) { switch (handle->scope) { case NET_SHAPER_SCOPE_UNSPEC: case NET_SHAPER_SCOPE_NETDEV: case __NET_SHAPER_SCOPE_MAX: parent->scope = NET_SHAPER_SCOPE_UNSPEC; break; case NET_SHAPER_SCOPE_QUEUE: case NET_SHAPER_SCOPE_NODE: parent->scope = NET_SHAPER_SCOPE_NETDEV; break; } parent->id = 0; } /* * MARK_0 is already in use due to XA_FLAGS_ALLOC, can't reuse such flag as * it's cleared by xa_store(). */ #define NET_SHAPER_NOT_VALID XA_MARK_1 static struct net_shaper * net_shaper_lookup(struct net_shaper_binding *binding, const struct net_shaper_handle *handle) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); u32 index = net_shaper_handle_to_index(handle); if (!hierarchy || xa_get_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID)) return NULL; return xa_load(&hierarchy->shapers, index); } /* Allocate on demand the per device shaper's hierarchy container. * Called under the net shaper lock */ static struct net_shaper_hierarchy * net_shaper_hierarchy_setup(struct net_shaper_binding *binding) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); if (hierarchy) return hierarchy; hierarchy = kmalloc(sizeof(*hierarchy), GFP_KERNEL); if (!hierarchy) return NULL; /* The flag is required for ID allocation */ xa_init_flags(&hierarchy->shapers, XA_FLAGS_ALLOC); switch (binding->type) { case NET_SHAPER_BINDING_TYPE_NETDEV: /* Pairs with READ_ONCE in net_shaper_hierarchy. */ WRITE_ONCE(binding->netdev->net_shaper_hierarchy, hierarchy); break; } return hierarchy; } /* Prepare the hierarchy container to actually insert the given shaper, doing * in advance the needed allocations. */ static int net_shaper_pre_insert(struct net_shaper_binding *binding, struct net_shaper_handle *handle, struct netlink_ext_ack *extack) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *prev, *cur; bool id_allocated = false; int ret, index; if (!hierarchy) return -ENOMEM; index = net_shaper_handle_to_index(handle); cur = xa_load(&hierarchy->shapers, index); if (cur) return 0; /* Allocated a new id, if needed. */ if (handle->scope == NET_SHAPER_SCOPE_NODE && handle->id == NET_SHAPER_ID_UNSPEC) { u32 min, max; handle->id = NET_SHAPER_ID_MASK - 1; max = net_shaper_handle_to_index(handle); handle->id = 0; min = net_shaper_handle_to_index(handle); ret = xa_alloc(&hierarchy->shapers, &index, NULL, XA_LIMIT(min, max), GFP_KERNEL); if (ret < 0) { NL_SET_ERR_MSG(extack, "Can't allocate new id for NODE shaper"); return ret; } net_shaper_index_to_handle(index, handle); id_allocated = true; } cur = kzalloc(sizeof(*cur), GFP_KERNEL); if (!cur) { ret = -ENOMEM; goto free_id; } /* Mark 'tentative' shaper inside the hierarchy container. * xa_set_mark is a no-op if the previous store fails. */ xa_lock(&hierarchy->shapers); prev = __xa_store(&hierarchy->shapers, index, cur, GFP_KERNEL); __xa_set_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID); xa_unlock(&hierarchy->shapers); if (xa_err(prev)) { NL_SET_ERR_MSG(extack, "Can't insert shaper into device store"); kfree_rcu(cur, rcu); ret = xa_err(prev); goto free_id; } return 0; free_id: if (id_allocated) xa_erase(&hierarchy->shapers, index); return ret; } /* Commit the tentative insert with the actual values. * Must be called only after a successful net_shaper_pre_insert(). */ static void net_shaper_commit(struct net_shaper_binding *binding, int nr_shapers, const struct net_shaper *shapers) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *cur; int index; int i; xa_lock(&hierarchy->shapers); for (i = 0; i < nr_shapers; ++i) { index = net_shaper_handle_to_index(&shapers[i].handle); cur = xa_load(&hierarchy->shapers, index); if (WARN_ON_ONCE(!cur)) continue; /* Successful update: drop the tentative mark * and update the hierarchy container. */ __xa_clear_mark(&hierarchy->shapers, index, NET_SHAPER_NOT_VALID); *cur = shapers[i]; } xa_unlock(&hierarchy->shapers); } /* Rollback all the tentative inserts from the hierarchy. */ static void net_shaper_rollback(struct net_shaper_binding *binding) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *cur; unsigned long index; if (!hierarchy) return; xa_lock(&hierarchy->shapers); xa_for_each_marked(&hierarchy->shapers, index, cur, NET_SHAPER_NOT_VALID) { __xa_erase(&hierarchy->shapers, index); kfree(cur); } xa_unlock(&hierarchy->shapers); } static int net_shaper_parse_handle(const struct nlattr *attr, const struct genl_info *info, struct net_shaper_handle *handle) { struct nlattr *tb[NET_SHAPER_A_HANDLE_MAX + 1]; struct nlattr *id_attr; u32 id = 0; int ret; ret = nla_parse_nested(tb, NET_SHAPER_A_HANDLE_MAX, attr, net_shaper_handle_nl_policy, info->extack); if (ret < 0) return ret; if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NET_SHAPER_A_HANDLE_SCOPE)) return -EINVAL; handle->scope = nla_get_u32(tb[NET_SHAPER_A_HANDLE_SCOPE]); /* The default id for NODE scope shapers is an invalid one * to help the 'group' operation discriminate between new * NODE shaper creation (ID_UNSPEC) and reuse of existing * shaper (any other value). */ id_attr = tb[NET_SHAPER_A_HANDLE_ID]; if (id_attr) id = nla_get_u32(id_attr); else if (handle->scope == NET_SHAPER_SCOPE_NODE) id = NET_SHAPER_ID_UNSPEC; handle->id = id; return 0; } static int net_shaper_validate_caps(struct net_shaper_binding *binding, struct nlattr **tb, const struct genl_info *info, struct net_shaper *shaper) { const struct net_shaper_ops *ops = net_shaper_ops(binding); struct nlattr *bad = NULL; unsigned long caps = 0; ops->capabilities(binding, shaper->handle.scope, &caps); if (tb[NET_SHAPER_A_PRIORITY] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_PRIORITY))) bad = tb[NET_SHAPER_A_PRIORITY]; if (tb[NET_SHAPER_A_WEIGHT] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_WEIGHT))) bad = tb[NET_SHAPER_A_WEIGHT]; if (tb[NET_SHAPER_A_BW_MIN] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MIN))) bad = tb[NET_SHAPER_A_BW_MIN]; if (tb[NET_SHAPER_A_BW_MAX] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MAX))) bad = tb[NET_SHAPER_A_BW_MAX]; if (tb[NET_SHAPER_A_BURST] && !(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_BURST))) bad = tb[NET_SHAPER_A_BURST]; if (!caps) bad = tb[NET_SHAPER_A_HANDLE]; if (bad) { NL_SET_BAD_ATTR(info->extack, bad); return -EOPNOTSUPP; } if (shaper->handle.scope == NET_SHAPER_SCOPE_QUEUE && binding->type == NET_SHAPER_BINDING_TYPE_NETDEV && shaper->handle.id >= binding->netdev->real_num_tx_queues) { NL_SET_ERR_MSG_FMT(info->extack, "Not existing queue id %d max %d", shaper->handle.id, binding->netdev->real_num_tx_queues); return -ENOENT; } /* The metric is really used only if there is *any* rate-related * setting, either in current attributes set or in pre-existing * values. */ if (shaper->burst || shaper->bw_min || shaper->bw_max) { u32 metric_cap = NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS + shaper->metric; /* The metric test can fail even when the user did not * specify the METRIC attribute. Pointing to rate related * attribute will be confusing, as the attribute itself * could be indeed supported, with a different metric. * Be more specific. */ if (!(caps & BIT(metric_cap))) { NL_SET_ERR_MSG_FMT(info->extack, "Bad metric %d", shaper->metric); return -EOPNOTSUPP; } } return 0; } static int net_shaper_parse_info(struct net_shaper_binding *binding, struct nlattr **tb, const struct genl_info *info, struct net_shaper *shaper, bool *exists) { struct net_shaper *old; int ret; /* The shaper handle is the only mandatory attribute. */ if (NL_REQ_ATTR_CHECK(info->extack, NULL, tb, NET_SHAPER_A_HANDLE)) return -EINVAL; ret = net_shaper_parse_handle(tb[NET_SHAPER_A_HANDLE], info, &shaper->handle); if (ret) return ret; if (shaper->handle.scope == NET_SHAPER_SCOPE_UNSPEC) { NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_HANDLE]); return -EINVAL; } /* Fetch existing hierarchy, if any, so that user provide info will * incrementally update the existing shaper configuration. */ old = net_shaper_lookup(binding, &shaper->handle); if (old) *shaper = *old; *exists = !!old; if (tb[NET_SHAPER_A_METRIC]) shaper->metric = nla_get_u32(tb[NET_SHAPER_A_METRIC]); if (tb[NET_SHAPER_A_BW_MIN]) shaper->bw_min = nla_get_uint(tb[NET_SHAPER_A_BW_MIN]); if (tb[NET_SHAPER_A_BW_MAX]) shaper->bw_max = nla_get_uint(tb[NET_SHAPER_A_BW_MAX]); if (tb[NET_SHAPER_A_BURST]) shaper->burst = nla_get_uint(tb[NET_SHAPER_A_BURST]); if (tb[NET_SHAPER_A_PRIORITY]) shaper->priority = nla_get_u32(tb[NET_SHAPER_A_PRIORITY]); if (tb[NET_SHAPER_A_WEIGHT]) shaper->weight = nla_get_u32(tb[NET_SHAPER_A_WEIGHT]); ret = net_shaper_validate_caps(binding, tb, info, shaper); if (ret < 0) return ret; return 0; } static int net_shaper_validate_nesting(struct net_shaper_binding *binding, const struct net_shaper *shaper, struct netlink_ext_ack *extack) { const struct net_shaper_ops *ops = net_shaper_ops(binding); unsigned long caps = 0; ops->capabilities(binding, shaper->handle.scope, &caps); if (!(caps & BIT(NET_SHAPER_A_CAPS_SUPPORT_NESTING))) { NL_SET_ERR_MSG_FMT(extack, "Nesting not supported for scope %d", shaper->handle.scope); return -EOPNOTSUPP; } return 0; } /* Fetch the existing leaf and update it with the user-provided * attributes. */ static int net_shaper_parse_leaf(struct net_shaper_binding *binding, const struct nlattr *attr, const struct genl_info *info, const struct net_shaper *node, struct net_shaper *shaper) { struct nlattr *tb[NET_SHAPER_A_WEIGHT + 1]; bool exists; int ret; ret = nla_parse_nested(tb, NET_SHAPER_A_WEIGHT, attr, net_shaper_leaf_info_nl_policy, info->extack); if (ret < 0) return ret; ret = net_shaper_parse_info(binding, tb, info, shaper, &exists); if (ret < 0) return ret; if (shaper->handle.scope != NET_SHAPER_SCOPE_QUEUE) { NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_HANDLE]); return -EINVAL; } if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { ret = net_shaper_validate_nesting(binding, shaper, info->extack); if (ret < 0) return ret; } if (!exists) net_shaper_default_parent(&shaper->handle, &shaper->parent); return 0; } /* Alike net_parse_shaper_info(), but additionally allow the user specifying * the shaper's parent handle. */ static int net_shaper_parse_node(struct net_shaper_binding *binding, struct nlattr **tb, const struct genl_info *info, struct net_shaper *shaper) { bool exists; int ret; ret = net_shaper_parse_info(binding, tb, info, shaper, &exists); if (ret) return ret; if (shaper->handle.scope != NET_SHAPER_SCOPE_NODE && shaper->handle.scope != NET_SHAPER_SCOPE_NETDEV) { NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_HANDLE]); return -EINVAL; } if (tb[NET_SHAPER_A_PARENT]) { ret = net_shaper_parse_handle(tb[NET_SHAPER_A_PARENT], info, &shaper->parent); if (ret) return ret; if (shaper->parent.scope != NET_SHAPER_SCOPE_NODE && shaper->parent.scope != NET_SHAPER_SCOPE_NETDEV) { NL_SET_BAD_ATTR(info->extack, tb[NET_SHAPER_A_PARENT]); return -EINVAL; } } return 0; } static int net_shaper_generic_pre(struct genl_info *info, int type) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)info->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(info->ctx)); return net_shaper_ctx_setup(info, type, ctx); } int net_shaper_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { return net_shaper_generic_pre(info, NET_SHAPER_A_IFINDEX); } static void net_shaper_generic_post(struct genl_info *info) { net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)info->ctx); } void net_shaper_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { net_shaper_generic_post(info); } int net_shaper_nl_pre_dumpit(struct netlink_callback *cb) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; const struct genl_info *info = genl_info_dump(cb); return net_shaper_ctx_setup(info, NET_SHAPER_A_IFINDEX, ctx); } int net_shaper_nl_post_dumpit(struct netlink_callback *cb) { net_shaper_ctx_cleanup((struct net_shaper_nl_ctx *)cb->ctx); return 0; } int net_shaper_nl_cap_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { return net_shaper_generic_pre(info, NET_SHAPER_A_CAPS_IFINDEX); } void net_shaper_nl_cap_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { net_shaper_generic_post(info); } int net_shaper_nl_cap_pre_dumpit(struct netlink_callback *cb) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; return net_shaper_ctx_setup(genl_info_dump(cb), NET_SHAPER_A_CAPS_IFINDEX, ctx); } int net_shaper_nl_cap_post_dumpit(struct netlink_callback *cb) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; net_shaper_ctx_cleanup(ctx); return 0; } int net_shaper_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_binding *binding; struct net_shaper_handle handle; struct net_shaper *shaper; struct sk_buff *msg; int ret; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_HANDLE)) return -EINVAL; binding = net_shaper_binding_from_ctx(info->ctx); ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info, &handle); if (ret < 0) return ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; rcu_read_lock(); shaper = net_shaper_lookup(binding, &handle); if (!shaper) { NL_SET_BAD_ATTR(info->extack, info->attrs[NET_SHAPER_A_HANDLE]); rcu_read_unlock(); ret = -ENOENT; goto free_msg; } ret = net_shaper_fill_one(msg, binding, shaper, info); rcu_read_unlock(); if (ret) goto free_msg; ret = genlmsg_reply(msg, info); if (ret) goto free_msg; return 0; free_msg: nlmsg_free(msg); return ret; } int net_shaper_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net_shaper_nl_ctx *ctx = (struct net_shaper_nl_ctx *)cb->ctx; const struct genl_info *info = genl_info_dump(cb); struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; struct net_shaper *shaper; int ret = 0; /* Don't error out dumps performed before any set operation. */ binding = net_shaper_binding_from_ctx(ctx); hierarchy = net_shaper_hierarchy(binding); if (!hierarchy) return 0; rcu_read_lock(); for (; (shaper = xa_find(&hierarchy->shapers, &ctx->start_index, U32_MAX, XA_PRESENT)); ctx->start_index++) { ret = net_shaper_fill_one(skb, binding, shaper, info); if (ret) break; } rcu_read_unlock(); return ret; } int net_shaper_nl_set_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; const struct net_shaper_ops *ops; struct net_shaper_handle handle; struct net_shaper shaper = {}; bool exists; int ret; binding = net_shaper_binding_from_ctx(info->ctx); net_shaper_lock(binding); ret = net_shaper_parse_info(binding, info->attrs, info, &shaper, &exists); if (ret) goto unlock; if (!exists) net_shaper_default_parent(&shaper.handle, &shaper.parent); hierarchy = net_shaper_hierarchy_setup(binding); if (!hierarchy) { ret = -ENOMEM; goto unlock; } /* The 'set' operation can't create node-scope shapers. */ handle = shaper.handle; if (handle.scope == NET_SHAPER_SCOPE_NODE && !net_shaper_lookup(binding, &handle)) { ret = -ENOENT; goto unlock; } ret = net_shaper_pre_insert(binding, &handle, info->extack); if (ret) goto unlock; ops = net_shaper_ops(binding); ret = ops->set(binding, &shaper, info->extack); if (ret) { net_shaper_rollback(binding); goto unlock; } net_shaper_commit(binding, 1, &shaper); unlock: net_shaper_unlock(binding); return ret; } static int __net_shaper_delete(struct net_shaper_binding *binding, struct net_shaper *shaper, struct netlink_ext_ack *extack) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper_handle parent_handle, handle = shaper->handle; const struct net_shaper_ops *ops = net_shaper_ops(binding); int ret; again: parent_handle = shaper->parent; ret = ops->delete(binding, &handle, extack); if (ret < 0) return ret; xa_erase(&hierarchy->shapers, net_shaper_handle_to_index(&handle)); kfree_rcu(shaper, rcu); /* Eventually delete the parent, if it is left over with no leaves. */ if (parent_handle.scope == NET_SHAPER_SCOPE_NODE) { shaper = net_shaper_lookup(binding, &parent_handle); if (shaper && !--shaper->leaves) { handle = parent_handle; goto again; } } return 0; } static int net_shaper_handle_cmp(const struct net_shaper_handle *a, const struct net_shaper_handle *b) { /* Must avoid holes in struct net_shaper_handle. */ BUILD_BUG_ON(sizeof(*a) != 8); return memcmp(a, b, sizeof(*a)); } static int net_shaper_parent_from_leaves(int leaves_count, const struct net_shaper *leaves, struct net_shaper *node, struct netlink_ext_ack *extack) { struct net_shaper_handle parent = leaves[0].parent; int i; for (i = 1; i < leaves_count; ++i) { if (net_shaper_handle_cmp(&leaves[i].parent, &parent)) { NL_SET_ERR_MSG_FMT(extack, "All the leaves shapers must have the same old parent"); return -EINVAL; } } node->parent = parent; return 0; } static int __net_shaper_group(struct net_shaper_binding *binding, bool update_node, int leaves_count, struct net_shaper *leaves, struct net_shaper *node, struct netlink_ext_ack *extack) { const struct net_shaper_ops *ops = net_shaper_ops(binding); struct net_shaper_handle leaf_handle; struct net_shaper *parent = NULL; bool new_node = false; int i, ret; if (node->handle.scope == NET_SHAPER_SCOPE_NODE) { new_node = node->handle.id == NET_SHAPER_ID_UNSPEC; if (!new_node && !net_shaper_lookup(binding, &node->handle)) { /* The related attribute is not available when * reaching here from the delete() op. */ NL_SET_ERR_MSG_FMT(extack, "Node shaper %d:%d does not exists", node->handle.scope, node->handle.id); return -ENOENT; } /* When unspecified, the node parent scope is inherited from * the leaves. */ if (node->parent.scope == NET_SHAPER_SCOPE_UNSPEC) { ret = net_shaper_parent_from_leaves(leaves_count, leaves, node, extack); if (ret) return ret; } } else { net_shaper_default_parent(&node->handle, &node->parent); } if (node->parent.scope == NET_SHAPER_SCOPE_NODE) { parent = net_shaper_lookup(binding, &node->parent); if (!parent) { NL_SET_ERR_MSG_FMT(extack, "Node parent shaper %d:%d does not exists", node->parent.scope, node->parent.id); return -ENOENT; } ret = net_shaper_validate_nesting(binding, node, extack); if (ret < 0) return ret; } if (update_node) { /* For newly created node scope shaper, the following will * update the handle, due to id allocation. */ ret = net_shaper_pre_insert(binding, &node->handle, extack); if (ret) return ret; } for (i = 0; i < leaves_count; ++i) { leaf_handle = leaves[i].handle; ret = net_shaper_pre_insert(binding, &leaf_handle, extack); if (ret) goto rollback; if (!net_shaper_handle_cmp(&leaves[i].parent, &node->handle)) continue; /* The leaves shapers will be nested to the node, update the * linking accordingly. */ leaves[i].parent = node->handle; node->leaves++; } ret = ops->group(binding, leaves_count, leaves, node, extack); if (ret < 0) goto rollback; /* The node's parent gains a new leaf only when the node itself * is created by this group operation */ if (new_node && parent) parent->leaves++; if (update_node) net_shaper_commit(binding, 1, node); net_shaper_commit(binding, leaves_count, leaves); return 0; rollback: net_shaper_rollback(binding); return ret; } static int net_shaper_pre_del_node(struct net_shaper_binding *binding, const struct net_shaper *shaper, struct netlink_ext_ack *extack) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *cur, *leaves, node = {}; int ret, leaves_count = 0; unsigned long index; bool update_node; if (!shaper->leaves) return 0; /* Fetch the new node information. */ node.handle = shaper->parent; cur = net_shaper_lookup(binding, &node.handle); if (cur) { node = *cur; } else { /* A scope NODE shaper can be nested only to the NETDEV scope * shaper without creating the latter, this check may fail only * if the data is in inconsistent status. */ if (WARN_ON_ONCE(node.handle.scope != NET_SHAPER_SCOPE_NETDEV)) return -EINVAL; } leaves = kcalloc(shaper->leaves, sizeof(struct net_shaper), GFP_KERNEL); if (!leaves) return -ENOMEM; /* Build the leaves arrays. */ xa_for_each(&hierarchy->shapers, index, cur) { if (net_shaper_handle_cmp(&cur->parent, &shaper->handle)) continue; if (WARN_ON_ONCE(leaves_count == shaper->leaves)) { ret = -EINVAL; goto free; } leaves[leaves_count++] = *cur; } /* When re-linking to the netdev shaper, avoid the eventual, implicit, * creation of the new node, would be surprising since the user is * doing a delete operation. */ update_node = node.handle.scope != NET_SHAPER_SCOPE_NETDEV; ret = __net_shaper_group(binding, update_node, leaves_count, leaves, &node, extack); free: kfree(leaves); return ret; } int net_shaper_nl_delete_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; struct net_shaper_handle handle; struct net_shaper *shaper; int ret; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_HANDLE)) return -EINVAL; binding = net_shaper_binding_from_ctx(info->ctx); net_shaper_lock(binding); ret = net_shaper_parse_handle(info->attrs[NET_SHAPER_A_HANDLE], info, &handle); if (ret) goto unlock; hierarchy = net_shaper_hierarchy(binding); if (!hierarchy) { ret = -ENOENT; goto unlock; } shaper = net_shaper_lookup(binding, &handle); if (!shaper) { ret = -ENOENT; goto unlock; } if (handle.scope == NET_SHAPER_SCOPE_NODE) { ret = net_shaper_pre_del_node(binding, shaper, info->extack); if (ret) goto unlock; } ret = __net_shaper_delete(binding, shaper, info->extack); unlock: net_shaper_unlock(binding); return ret; } static int net_shaper_group_send_reply(struct net_shaper_binding *binding, const struct net_shaper_handle *handle, struct genl_info *info, struct sk_buff *msg) { void *hdr; hdr = genlmsg_iput(msg, info); if (!hdr) goto free_msg; if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_IFINDEX) || net_shaper_fill_handle(msg, handle, NET_SHAPER_A_HANDLE)) goto free_msg; genlmsg_end(msg, hdr); return genlmsg_reply(msg, info); free_msg: /* Should never happen as msg is pre-allocated with enough space. */ WARN_ONCE(true, "calculated message payload length (%d)", net_shaper_handle_size()); nlmsg_free(msg); return -EMSGSIZE; } int net_shaper_nl_group_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper **old_nodes, *leaves, node = {}; struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding *binding; int i, ret, rem, leaves_count; int old_nodes_count = 0; struct sk_buff *msg; struct nlattr *attr; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_LEAVES)) return -EINVAL; binding = net_shaper_binding_from_ctx(info->ctx); /* The group operation is optional. */ if (!net_shaper_ops(binding)->group) return -EOPNOTSUPP; net_shaper_lock(binding); leaves_count = net_shaper_list_len(info, NET_SHAPER_A_LEAVES); if (!leaves_count) { NL_SET_BAD_ATTR(info->extack, info->attrs[NET_SHAPER_A_LEAVES]); ret = -EINVAL; goto unlock; } leaves = kcalloc(leaves_count, sizeof(struct net_shaper) + sizeof(struct net_shaper *), GFP_KERNEL); if (!leaves) { ret = -ENOMEM; goto unlock; } old_nodes = (void *)&leaves[leaves_count]; ret = net_shaper_parse_node(binding, info->attrs, info, &node); if (ret) goto free_leaves; i = 0; nla_for_each_attr_type(attr, NET_SHAPER_A_LEAVES, genlmsg_data(info->genlhdr), genlmsg_len(info->genlhdr), rem) { if (WARN_ON_ONCE(i >= leaves_count)) goto free_leaves; ret = net_shaper_parse_leaf(binding, attr, info, &node, &leaves[i]); if (ret) goto free_leaves; i++; } /* Prepare the msg reply in advance, to avoid device operation * rollback on allocation failure. */ msg = genlmsg_new(net_shaper_handle_size(), GFP_KERNEL); if (!msg) goto free_leaves; hierarchy = net_shaper_hierarchy_setup(binding); if (!hierarchy) { ret = -ENOMEM; goto free_msg; } /* Record the node shapers that this group() operation can make * childless for later cleanup. */ for (i = 0; i < leaves_count; i++) { if (leaves[i].parent.scope == NET_SHAPER_SCOPE_NODE && net_shaper_handle_cmp(&leaves[i].parent, &node.handle)) { struct net_shaper *tmp; tmp = net_shaper_lookup(binding, &leaves[i].parent); if (!tmp) continue; old_nodes[old_nodes_count++] = tmp; } } ret = __net_shaper_group(binding, true, leaves_count, leaves, &node, info->extack); if (ret) goto free_msg; /* Check if we need to delete any node left alone by the new leaves * linkage. */ for (i = 0; i < old_nodes_count; ++i) { struct net_shaper *tmp = old_nodes[i]; if (--tmp->leaves > 0) continue; /* Errors here are not fatal: the grouping operation is * completed, and user-space can still explicitly clean-up * left-over nodes. */ __net_shaper_delete(binding, tmp, info->extack); } ret = net_shaper_group_send_reply(binding, &node.handle, info, msg); if (ret) GENL_SET_ERR_MSG_FMT(info, "Can't send reply"); free_leaves: kfree(leaves); unlock: net_shaper_unlock(binding); return ret; free_msg: kfree_skb(msg); goto free_leaves; } static int net_shaper_cap_fill_one(struct sk_buff *msg, struct net_shaper_binding *binding, enum net_shaper_scope scope, unsigned long flags, const struct genl_info *info) { unsigned long cur; void *hdr; hdr = genlmsg_iput(msg, info); if (!hdr) return -EMSGSIZE; if (net_shaper_fill_binding(msg, binding, NET_SHAPER_A_CAPS_IFINDEX) || nla_put_u32(msg, NET_SHAPER_A_CAPS_SCOPE, scope)) goto nla_put_failure; for (cur = NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS; cur <= NET_SHAPER_A_CAPS_MAX; ++cur) { if (flags & BIT(cur) && nla_put_flag(msg, cur)) goto nla_put_failure; } genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int net_shaper_nl_cap_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net_shaper_binding *binding; const struct net_shaper_ops *ops; enum net_shaper_scope scope; unsigned long flags = 0; struct sk_buff *msg; int ret; if (GENL_REQ_ATTR_CHECK(info, NET_SHAPER_A_CAPS_SCOPE)) return -EINVAL; binding = net_shaper_binding_from_ctx(info->ctx); scope = nla_get_u32(info->attrs[NET_SHAPER_A_CAPS_SCOPE]); ops = net_shaper_ops(binding); ops->capabilities(binding, scope, &flags); if (!flags) return -EOPNOTSUPP; msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = net_shaper_cap_fill_one(msg, binding, scope, flags, info); if (ret) goto free_msg; ret = genlmsg_reply(msg, info); if (ret) goto free_msg; return 0; free_msg: nlmsg_free(msg); return ret; } int net_shaper_nl_cap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); struct net_shaper_binding *binding; const struct net_shaper_ops *ops; enum net_shaper_scope scope; int ret; binding = net_shaper_binding_from_ctx(cb->ctx); ops = net_shaper_ops(binding); for (scope = 0; scope <= NET_SHAPER_SCOPE_MAX; ++scope) { unsigned long flags = 0; ops->capabilities(binding, scope, &flags); if (!flags) continue; ret = net_shaper_cap_fill_one(skb, binding, scope, flags, info); if (ret) return ret; } return 0; } static void net_shaper_flush(struct net_shaper_binding *binding) { struct net_shaper_hierarchy *hierarchy = net_shaper_hierarchy(binding); struct net_shaper *cur; unsigned long index; if (!hierarchy) return; net_shaper_lock(binding); xa_lock(&hierarchy->shapers); xa_for_each(&hierarchy->shapers, index, cur) { __xa_erase(&hierarchy->shapers, index); kfree(cur); } xa_unlock(&hierarchy->shapers); net_shaper_unlock(binding); kfree(hierarchy); } void net_shaper_flush_netdev(struct net_device *dev) { struct net_shaper_binding binding = { .type = NET_SHAPER_BINDING_TYPE_NETDEV, .netdev = dev, }; net_shaper_flush(&binding); } void net_shaper_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { struct net_shaper_hierarchy *hierarchy; struct net_shaper_binding binding; int i; binding.type = NET_SHAPER_BINDING_TYPE_NETDEV; binding.netdev = dev; hierarchy = net_shaper_hierarchy(&binding); if (!hierarchy) return; /* Only drivers implementing shapers support ensure * the lock is acquired in advance. */ netdev_assert_locked(dev); /* Take action only when decreasing the tx queue number. */ for (i = txq; i < dev->real_num_tx_queues; ++i) { struct net_shaper_handle handle, parent_handle; struct net_shaper *shaper; u32 index; handle.scope = NET_SHAPER_SCOPE_QUEUE; handle.id = i; shaper = net_shaper_lookup(&binding, &handle); if (!shaper) continue; /* Don't touch the H/W for the queue shaper, the drivers already * deleted the queue and related resources. */ parent_handle = shaper->parent; index = net_shaper_handle_to_index(&handle); xa_erase(&hierarchy->shapers, index); kfree_rcu(shaper, rcu); /* The recursion on parent does the full job. */ if (parent_handle.scope != NET_SHAPER_SCOPE_NODE) continue; shaper = net_shaper_lookup(&binding, &parent_handle); if (shaper && !--shaper->leaves) __net_shaper_delete(&binding, shaper, NULL); } } static int __init shaper_init(void) { return genl_register_family(&net_shaper_nl_family); } subsys_initcall(shaper_init); |
| 9 1 1 8 9 3 6 17 17 17 3669 3675 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/lib/cmdline.c * Helper functions generally used for parsing kernel command line * and module options. * * Code and copyrights come from init/main.c and arch/i386/kernel/setup.c. * * GNU Indent formatting options for this file: -kr -i8 -npsl -pcs */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/ctype.h> /* * If a hyphen was found in get_option, this will handle the * range of numbers, M-N. This will expand the range and insert * the values[M, M+1, ..., N] into the ints array in get_options. */ static int get_range(char **str, int *pint, int n) { int x, inc_counter, upper_range; (*str)++; upper_range = simple_strtol((*str), NULL, 0); inc_counter = upper_range - *pint; for (x = *pint; n && x < upper_range; x++, n--) *pint++ = x; return inc_counter; } /** * get_option - Parse integer from an option string * @str: option string * @pint: (optional output) integer value parsed from @str * * Read an int from an option string; if available accept a subsequent * comma as well. * * When @pint is NULL the function can be used as a validator of * the current option in the string. * * Return values: * 0 - no int in string * 1 - int found, no subsequent comma * 2 - int found including a subsequent comma * 3 - hyphen found to denote a range * * Leading hyphen without integer is no integer case, but we consume it * for the sake of simplification. */ int get_option(char **str, int *pint) { char *cur = *str; int value; if (!cur || !(*cur)) return 0; if (*cur == '-') value = -simple_strtoull(++cur, str, 0); else value = simple_strtoull(cur, str, 0); if (pint) *pint = value; if (cur == *str) return 0; if (**str == ',') { (*str)++; return 2; } if (**str == '-') return 3; return 1; } EXPORT_SYMBOL(get_option); /** * get_options - Parse a string into a list of integers * @str: String to be parsed * @nints: size of integer array * @ints: integer array (must have room for at least one element) * * This function parses a string containing a comma-separated * list of integers, a hyphen-separated range of _positive_ integers, * or a combination of both. The parse halts when the array is * full, or when no more numbers can be retrieved from the * string. * * When @nints is 0, the function just validates the given @str and * returns the amount of parseable integers as described below. * * Returns: * * The first element is filled by the number of collected integers * in the range. The rest is what was parsed from the @str. * * Return value is the character in the string which caused * the parse to end (typically a null terminator, if @str is * completely parseable). */ char *get_options(const char *str, int nints, int *ints) { bool validate = (nints == 0); int res, i = 1; while (i < nints || validate) { int *pint = validate ? ints : ints + i; res = get_option((char **)&str, pint); if (res == 0) break; if (res == 3) { int n = validate ? 0 : nints - i; int range_nums; range_nums = get_range((char **)&str, pint, n); if (range_nums < 0) break; /* * Decrement the result by one to leave out the * last number in the range. The next iteration * will handle the upper number in the range */ i += (range_nums - 1); } i++; if (res == 1) break; } ints[0] = i - 1; return (char *)str; } EXPORT_SYMBOL(get_options); /** * memparse - parse a string with mem suffixes into a number * @ptr: Where parse begins * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) { char *endptr; /* local pointer to end of parsed string */ unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { case 'E': case 'e': ret <<= 10; fallthrough; case 'P': case 'p': ret <<= 10; fallthrough; case 'T': case 't': ret <<= 10; fallthrough; case 'G': case 'g': ret <<= 10; fallthrough; case 'M': case 'm': ret <<= 10; fallthrough; case 'K': case 'k': ret <<= 10; endptr++; fallthrough; default: break; } if (retptr) *retptr = endptr; return ret; } EXPORT_SYMBOL(memparse); /** * parse_option_str - Parse a string and check an option is set or not * @str: String to be parsed * @option: option name * * This function parses a string containing a comma-separated list of * strings like a=b,c. * * Return true if there's such option in the string, or return false. */ bool parse_option_str(const char *str, const char *option) { while (*str) { if (!strncmp(str, option, strlen(option))) { str += strlen(option); if (!*str || *str == ',') return true; } while (*str && *str != ',') str++; if (*str == ',') str++; } return false; } /* * Parse a string to get a param value pair. * You can use " around spaces, but can't escape ". * Hyphens and underscores equivalent in parameter names. */ char *next_arg(char *args, char **param, char **val) { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; if (*args == '"') { args++; in_quote = 1; quoted = 1; } for (i = 0; args[i]; i++) { if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') equals = i; } if (args[i] == '"') in_quote = !in_quote; } *param = args; if (!equals) *val = NULL; else { args[equals] = '\0'; *val = args + equals + 1; /* Don't include quotes in value. */ if (**val == '"') { (*val)++; if (args[i-1] == '"') args[i-1] = '\0'; } } if (quoted && i > 0 && args[i-1] == '"') args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; args += i + 1; } else args += i; /* Chew up trailing spaces. */ return skip_spaces(args); } EXPORT_SYMBOL(next_arg); |
| 12 12 12 12 12 12 11 11 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 205 12 193 193 194 194 194 206 205 106 206 206 12 193 14 204 206 126 191 191 123 7 123 19 7 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 | // SPDX-License-Identifier: MIT /* * Copyright 2018 Noralf Trønnes * Copyright (c) 2006-2009 Red Hat Inc. * Copyright (c) 2006-2008 Intel Corporation * Jesse Barnes <jesse.barnes@intel.com> * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> */ #include "drm/drm_modeset_lock.h" #include <linux/export.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include <drm/drm_atomic.h> #include <drm/drm_client.h> #include <drm/drm_connector.h> #include <drm/drm_crtc.h> #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" #define DRM_CLIENT_MAX_CLONED_CONNECTORS 8 struct drm_client_offset { int x, y; }; int drm_client_modeset_create(struct drm_client_dev *client) { struct drm_device *dev = client->dev; unsigned int num_crtc = dev->mode_config.num_crtc; unsigned int max_connector_count = 1; struct drm_mode_set *modeset; struct drm_crtc *crtc; int i = 0; /* Add terminating zero entry to enable index less iteration */ client->modesets = kcalloc(num_crtc + 1, sizeof(*client->modesets), GFP_KERNEL); if (!client->modesets) return -ENOMEM; mutex_init(&client->modeset_mutex); drm_for_each_crtc(crtc, dev) client->modesets[i++].crtc = crtc; /* Cloning is only supported in the single crtc case. */ if (num_crtc == 1) max_connector_count = DRM_CLIENT_MAX_CLONED_CONNECTORS; for (modeset = client->modesets; modeset->crtc; modeset++) { modeset->connectors = kcalloc(max_connector_count, sizeof(*modeset->connectors), GFP_KERNEL); if (!modeset->connectors) goto err_free; } return 0; err_free: drm_client_modeset_free(client); return -ENOMEM; } static void drm_client_modeset_release(struct drm_client_dev *client) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) { int i; drm_mode_destroy(client->dev, modeset->mode); modeset->mode = NULL; modeset->fb = NULL; for (i = 0; i < modeset->num_connectors; i++) { drm_connector_put(modeset->connectors[i]); modeset->connectors[i] = NULL; } modeset->num_connectors = 0; } } void drm_client_modeset_free(struct drm_client_dev *client) { struct drm_mode_set *modeset; mutex_lock(&client->modeset_mutex); drm_client_modeset_release(client); drm_client_for_each_modeset(modeset, client) kfree(modeset->connectors); mutex_unlock(&client->modeset_mutex); mutex_destroy(&client->modeset_mutex); kfree(client->modesets); } static struct drm_mode_set * drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) if (modeset->crtc == crtc) return modeset; return NULL; } static const struct drm_display_mode * drm_connector_get_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_fallback_non_tiled_mode(struct drm_connector *connector) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) continue; return mode; } return NULL; } static const struct drm_display_mode * drm_connector_preferred_mode(struct drm_connector *connector, int width, int height) { const struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay > width || mode->vdisplay > height) continue; if (mode->type & DRM_MODE_TYPE_PREFERRED) return mode; } return NULL; } static const struct drm_display_mode * drm_connector_first_mode(struct drm_connector *connector) { return list_first_entry_or_null(&connector->modes, struct drm_display_mode, head); } static const struct drm_display_mode * drm_connector_pick_cmdline_mode(struct drm_connector *connector) { const struct drm_cmdline_mode *cmdline_mode; const struct drm_display_mode *mode; bool prefer_non_interlace; /* * Find a user-defined mode. If the user gave us a valid * mode on the kernel command line, it will show up in this * list. */ list_for_each_entry(mode, &connector->modes, head) { if (mode->type & DRM_MODE_TYPE_USERDEF) return mode; } cmdline_mode = &connector->cmdline_mode; if (cmdline_mode->specified == false) return NULL; /* * Attempt to find a matching mode in the list of modes we * have gotten so far. */ prefer_non_interlace = !cmdline_mode->interlace; again: list_for_each_entry(mode, &connector->modes, head) { /* check width/height */ if (mode->hdisplay != cmdline_mode->xres || mode->vdisplay != cmdline_mode->yres) continue; if (cmdline_mode->refresh_specified) { if (drm_mode_vrefresh(mode) != cmdline_mode->refresh) continue; } if (cmdline_mode->interlace) { if (!(mode->flags & DRM_MODE_FLAG_INTERLACE)) continue; } else if (prefer_non_interlace) { if (mode->flags & DRM_MODE_FLAG_INTERLACE) continue; } return mode; } if (prefer_non_interlace) { prefer_non_interlace = false; goto again; } return NULL; } static bool drm_connector_enabled(struct drm_connector *connector, bool strict) { bool enable; if (connector->display_info.non_desktop) return false; if (strict) enable = connector->status == connector_status_connected; else enable = connector->status != connector_status_disconnected; return enable; } static void drm_client_connectors_enabled(struct drm_connector *connectors[], unsigned int connector_count, bool enabled[]) { bool any_enabled = false; struct drm_connector *connector; int i = 0; for (i = 0; i < connector_count; i++) { connector = connectors[i]; enabled[i] = drm_connector_enabled(connector, true); drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] enabled? %s\n", connector->base.id, connector->name, connector->display_info.non_desktop ? "non desktop" : str_yes_no(enabled[i])); any_enabled |= enabled[i]; } if (any_enabled) return; for (i = 0; i < connector_count; i++) enabled[i] = drm_connector_enabled(connectors[i], false); } static void mode_replace(struct drm_device *dev, const struct drm_display_mode **dst, const struct drm_display_mode *src) { drm_mode_destroy(dev, (struct drm_display_mode *)*dst); *dst = src ? drm_mode_duplicate(dev, src) : NULL; } static void modes_destroy(struct drm_device *dev, const struct drm_display_mode *modes[], int count) { int i; for (i = 0; i < count; i++) mode_replace(dev, &modes[i], NULL); } static bool drm_client_target_cloned(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { int count, i; bool can_clone = false; struct drm_display_mode *dmt_mode; /* only contemplate cloning in the single crtc case */ if (dev->mode_config.num_crtc > 1) return false; count = 0; for (i = 0; i < connector_count; i++) { if (enabled[i]) count++; } /* only contemplate cloning if more than one connector is enabled */ if (count <= 1) return false; /* check the command line or if nothing common pick 1024x768 */ can_clone = true; for (i = 0; i < connector_count; i++) { int j; if (!enabled[i]) continue; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connectors[i])); if (!modes[i]) { can_clone = false; break; } for (j = 0; j < i; j++) { if (!enabled[j]) continue; if (!drm_mode_match(modes[j], modes[i], DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) can_clone = false; } } if (can_clone) { drm_dbg_kms(dev, "can clone using command line\n"); return true; } /* try and find a 1024x768 mode on each connector */ can_clone = true; dmt_mode = drm_mode_find_dmt(dev, 1024, 768, 60, false); if (!dmt_mode) goto fail; for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode; if (!enabled[i]) continue; list_for_each_entry(mode, &connectors[i]->modes, head) { if (drm_mode_match(mode, dmt_mode, DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) mode_replace(dev, &modes[i], mode); } if (!modes[i]) can_clone = false; } drm_mode_destroy(dev, dmt_mode); if (can_clone) { drm_dbg_kms(dev, "can clone using 1024x768\n"); return true; } fail: drm_info(dev, "kms: can't enable cloning when we probably wanted to.\n"); return false; } static int drm_client_get_tile_offsets(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], int idx, int h_idx, int v_idx) { int i; int hoffset = 0, voffset = 0; for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; if (!connector->has_tile) continue; if (!modes[i] && (h_idx || v_idx)) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no modes for connector tiled %d\n", connector->base.id, connector->name, i); continue; } if (connector->tile_h_loc < h_idx) hoffset += modes[i]->hdisplay; if (connector->tile_v_loc < v_idx) voffset += modes[i]->vdisplay; } offsets[idx].x = hoffset; offsets[idx].y = voffset; drm_dbg_kms(dev, "returned %d %d for %d %d\n", hoffset, voffset, h_idx, v_idx); return 0; } static bool drm_client_target_preferred(struct drm_device *dev, struct drm_connector *connectors[], unsigned int connector_count, const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const u64 mask = BIT_ULL(connector_count) - 1; u64 conn_configured = 0; int tile_pass = 0; int num_tiled_conns = 0; int i; for (i = 0; i < connector_count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: for (i = 0; i < connector_count; i++) { struct drm_connector *connector = connectors[i]; const char *mode_type; if (conn_configured & BIT_ULL(i)) continue; if (enabled[i] == false) { conn_configured |= BIT_ULL(i); continue; } /* first pass over all the untiled connectors */ if (tile_pass == 0 && connector->has_tile) continue; if (tile_pass == 1) { if (connector->tile_h_loc != 0 || connector->tile_v_loc != 0) continue; } else { if (connector->tile_h_loc != tile_pass - 1 && connector->tile_v_loc != tile_pass - 1) /* if this tile_pass doesn't cover any of the tiles - keep going */ continue; /* * find the tile offsets for this pass - need to find * all tiles left and above */ drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* * In case of tiled mode if all tiles not present fallback to * first available non tiled mode. * After all tiles are present, try to find the tiled mode * for all and if tiled mode not present due to fbcon size * limitations, use first non tiled mode only for * tile 0,0 and set to no mode for all other tiles. */ if (connector->has_tile) { if (num_tiled_conns < connector->num_h_tile * connector->num_v_tile || (connector->tile_h_loc == 0 && connector->tile_v_loc == 0 && !drm_connector_get_tiled_mode(connector))) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } else { mode_type = "tiled"; mode_replace(dev, &modes[i], drm_connector_get_tiled_mode(connector)); } } if (modes[i]) drm_dbg_kms(dev, "[CONNECTOR:%d:%s] found %s mode: %s\n", connector->base.id, connector->name, mode_type, modes[i]->name); else drm_dbg_kms(dev, "[CONNECTOR:%d:%s] no mode found\n", connector->base.id, connector->name); conn_configured |= BIT_ULL(i); } if ((conn_configured & mask) != mask) { tile_pass++; goto retry; } return true; } static bool connector_has_possible_crtc(struct drm_connector *connector, struct drm_crtc *crtc) { struct drm_encoder *encoder; drm_connector_for_each_possible_encoder(connector, encoder) { if (encoder->possible_crtcs & drm_crtc_mask(crtc)) return true; } return false; } static int drm_client_pick_crtcs(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *best_crtcs[], const struct drm_display_mode *modes[], int n, int width, int height) { struct drm_device *dev = client->dev; struct drm_connector *connector; int my_score, best_score, score; struct drm_crtc **crtcs; struct drm_mode_set *modeset; if (n == connector_count) return 0; connector = connectors[n]; best_crtcs[n] = NULL; best_score = drm_client_pick_crtcs(client, connectors, connector_count, best_crtcs, modes, n + 1, width, height); if (modes[n] == NULL) return best_score; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); if (!crtcs) return best_score; my_score = 1; if (connector->status == connector_status_connected) my_score++; if (connector->cmdline_mode.specified) my_score++; if (drm_connector_preferred_mode(connector, width, height)) my_score++; /* * select a crtc for this connector and then attempt to configure * remaining connectors */ drm_client_for_each_modeset(modeset, client) { struct drm_crtc *crtc = modeset->crtc; int o; if (!connector_has_possible_crtc(connector, crtc)) continue; for (o = 0; o < n; o++) if (best_crtcs[o] == crtc) break; if (o < n) { /* ignore cloning unless only a single crtc */ if (dev->mode_config.num_crtc > 1) continue; if (!drm_mode_equal(modes[o], modes[n])) continue; } crtcs[n] = crtc; memcpy(crtcs, best_crtcs, n * sizeof(*crtcs)); score = my_score + drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, n + 1, width, height); if (score > best_score) { best_score = score; memcpy(best_crtcs, crtcs, connector_count * sizeof(*crtcs)); } } kfree(crtcs); return best_score; } /* Try to read the BIOS display configuration and use it for the initial config */ static bool drm_client_firmware_config(struct drm_client_dev *client, struct drm_connector *connectors[], unsigned int connector_count, struct drm_crtc *crtcs[], const struct drm_display_mode *modes[], struct drm_client_offset offsets[], bool enabled[], int width, int height) { const int count = min_t(unsigned int, connector_count, BITS_PER_LONG); unsigned long conn_configured, conn_seq, mask; struct drm_device *dev = client->dev; int i; bool *save_enabled; bool fallback = true, ret = true; int num_connectors_enabled = 0; int num_connectors_detected = 0; int num_tiled_conns = 0; struct drm_modeset_acquire_ctx ctx; if (!drm_drv_uses_atomic_modeset(dev)) return false; if (drm_WARN_ON(dev, count <= 0)) return false; save_enabled = kcalloc(count, sizeof(bool), GFP_KERNEL); if (!save_enabled) return false; drm_modeset_acquire_init(&ctx, 0); while (drm_modeset_lock_all_ctx(dev, &ctx) != 0) drm_modeset_backoff(&ctx); memcpy(save_enabled, enabled, count); mask = GENMASK(count - 1, 0); conn_configured = 0; for (i = 0; i < count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: conn_seq = conn_configured; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; struct drm_encoder *encoder; struct drm_crtc *crtc; const char *mode_type; int j; if (conn_configured & BIT(i)) continue; if (conn_seq == 0 && !connector->has_tile) continue; if (connector->status == connector_status_connected) num_connectors_detected++; if (!enabled[i]) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] not enabled, skipping\n", connector->base.id, connector->name); conn_configured |= BIT(i); continue; } if (connector->force == DRM_FORCE_OFF) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] disabled by user, skipping\n", connector->base.id, connector->name); enabled[i] = false; continue; } encoder = connector->state->best_encoder; if (!encoder || drm_WARN_ON(dev, !connector->state->crtc)) { if (connector->force > DRM_FORCE_OFF) goto bail; drm_dbg_kms(dev, "[CONNECTOR:%d:%s] has no encoder or crtc, skipping\n", connector->base.id, connector->name); enabled[i] = false; conn_configured |= BIT(i); continue; } num_connectors_enabled++; crtc = connector->state->crtc; /* * Make sure we're not trying to drive multiple connectors * with a single CRTC, since our cloning support may not * match the BIOS. */ for (j = 0; j < count; j++) { if (crtcs[j] == crtc) { drm_dbg_kms(dev, "[CONNECTOR:%d:%s] fallback: cloned configuration\n", connector->base.id, connector->name); goto bail; } } mode_type = "cmdline"; mode_replace(dev, &modes[i], drm_connector_pick_cmdline_mode(connector)); if (!modes[i]) { mode_type = "preferred"; mode_replace(dev, &modes[i], drm_connector_preferred_mode(connector, width, height)); } if (!modes[i]) { mode_type = "first"; mode_replace(dev, &modes[i], drm_connector_first_mode(connector)); } /* last resort: use current mode */ if (!modes[i]) { mode_type = "current"; mode_replace(dev, &modes[i], &crtc->state->mode); } /* * In case of tiled modes, if all tiles are not present * then fallback to a non tiled mode. */ if (connector->has_tile && num_tiled_conns < connector->num_h_tile * connector->num_v_tile) { mode_type = "non tiled"; mode_replace(dev, &modes[i], drm_connector_fallback_non_tiled_mode(connector)); } crtcs[i] = crtc; drm_dbg_kms(dev, "[CONNECTOR::%d:%s] on [CRTC:%d:%s] using %s mode: %s\n", connector->base.id, connector->name, crtc->base.id, crtc->name, mode_type, modes[i]->name); fallback = false; conn_configured |= BIT(i); } if ((conn_configured & mask) != mask && conn_configured != conn_seq) goto retry; for (i = 0; i < count; i++) { struct drm_connector *connector = connectors[i]; if (connector->has_tile) drm_client_get_tile_offsets(dev, connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } /* * If the BIOS didn't enable everything it could, fall back to have the * same user experiencing of lighting up as much as possible like the * fbdev helper library. */ if (num_connectors_enabled != num_connectors_detected && num_connectors_enabled < dev->mode_config.num_crtc) { drm_dbg_kms(dev, "fallback: Not all outputs enabled\n"); drm_dbg_kms(dev, "Enabled: %i, detected: %i\n", num_connectors_enabled, num_connectors_detected); fallback = true; } if (fallback) { bail: drm_dbg_kms(dev, "Not using firmware configuration\n"); memcpy(enabled, save_enabled, count); ret = false; } drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); kfree(save_enabled); return ret; } /** * drm_client_modeset_probe() - Probe for displays * @client: DRM client * @width: Maximum display mode width (optional) * @height: Maximum display mode height (optional) * * This function sets up display pipelines for enabled connectors and stores the * config in the client's modeset array. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height) { struct drm_connector *connector, **connectors = NULL; struct drm_connector_list_iter conn_iter; struct drm_device *dev = client->dev; unsigned int total_modes_count = 0; struct drm_client_offset *offsets; unsigned int connector_count = 0; const struct drm_display_mode **modes; struct drm_crtc **crtcs; int i, ret = 0; bool *enabled; drm_dbg_kms(dev, "\n"); if (!width) width = dev->mode_config.max_width; if (!height) height = dev->mode_config.max_height; drm_connector_list_iter_begin(dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { struct drm_connector **tmp; tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto free_connectors; } connectors = tmp; drm_connector_get(connector); connectors[connector_count++] = connector; } drm_connector_list_iter_end(&conn_iter); if (!connector_count) return 0; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL); offsets = kcalloc(connector_count, sizeof(*offsets), GFP_KERNEL); enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL); if (!crtcs || !modes || !enabled || !offsets) { ret = -ENOMEM; goto out; } mutex_lock(&client->modeset_mutex); mutex_lock(&dev->mode_config.mutex); for (i = 0; i < connector_count; i++) total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height); if (!total_modes_count) drm_dbg_kms(dev, "No connectors reported connected with modes\n"); drm_client_connectors_enabled(connectors, connector_count, enabled); if (!drm_client_firmware_config(client, connectors, connector_count, crtcs, modes, offsets, enabled, width, height)) { modes_destroy(dev, modes, connector_count); memset(crtcs, 0, connector_count * sizeof(*crtcs)); memset(offsets, 0, connector_count * sizeof(*offsets)); if (!drm_client_target_cloned(dev, connectors, connector_count, modes, offsets, enabled, width, height) && !drm_client_target_preferred(dev, connectors, connector_count, modes, offsets, enabled, width, height)) drm_err(dev, "Unable to find initial modes\n"); drm_dbg_kms(dev, "picking CRTCs for %dx%d config\n", width, height); drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, 0, width, height); } mutex_unlock(&dev->mode_config.mutex); drm_client_modeset_release(client); for (i = 0; i < connector_count; i++) { const struct drm_display_mode *mode = modes[i]; struct drm_crtc *crtc = crtcs[i]; struct drm_client_offset *offset = &offsets[i]; if (mode && crtc) { struct drm_mode_set *modeset = drm_client_find_modeset(client, crtc); struct drm_connector *connector = connectors[i]; drm_dbg_kms(dev, "[CRTC:%d:%s] desired mode %s set (%d,%d)\n", crtc->base.id, crtc->name, mode->name, offset->x, offset->y); if (drm_WARN_ON_ONCE(dev, modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS || (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1))) { ret = -EINVAL; break; } drm_mode_destroy(dev, modeset->mode); modeset->mode = drm_mode_duplicate(dev, mode); if (!modeset->mode) { ret = -ENOMEM; break; } drm_connector_get(connector); modeset->connectors[modeset->num_connectors++] = connector; modeset->x = offset->x; modeset->y = offset->y; } } mutex_unlock(&client->modeset_mutex); out: kfree(crtcs); modes_destroy(dev, modes, connector_count); kfree(modes); kfree(offsets); kfree(enabled); free_connectors: for (i = 0; i < connector_count; i++) drm_connector_put(connectors[i]); kfree(connectors); return ret; } EXPORT_SYMBOL(drm_client_modeset_probe); /** * drm_client_rotation() - Check the initial rotation value * @modeset: DRM modeset * @rotation: Returned rotation value * * This function checks if the primary plane in @modeset can hw rotate * to match the rotation needed on its connector. * * Note: Currently only 0 and 180 degrees are supported. * * Return: * True if the plane can do the rotation, false otherwise. */ bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation) { struct drm_connector *connector = modeset->connectors[0]; struct drm_plane *plane = modeset->crtc->primary; struct drm_cmdline_mode *cmdline; u64 valid_mask = 0; int i; if (!modeset->num_connectors) return false; switch (connector->display_info.panel_orientation) { case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP: *rotation = DRM_MODE_ROTATE_180; break; case DRM_MODE_PANEL_ORIENTATION_LEFT_UP: *rotation = DRM_MODE_ROTATE_90; break; case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP: *rotation = DRM_MODE_ROTATE_270; break; default: *rotation = DRM_MODE_ROTATE_0; } /** * The panel already defined the default rotation * through its orientation. Whatever has been provided * on the command line needs to be added to that. * * Unfortunately, the rotations are at different bit * indices, so the math to add them up are not as * trivial as they could. * * Reflections on the other hand are pretty trivial to deal with, a * simple XOR between the two handle the addition nicely. */ cmdline = &connector->cmdline_mode; if (cmdline->specified && cmdline->rotation_reflection) { unsigned int cmdline_rest, panel_rest; unsigned int cmdline_rot, panel_rot; unsigned int sum_rot, sum_rest; panel_rot = ilog2(*rotation & DRM_MODE_ROTATE_MASK); cmdline_rot = ilog2(cmdline->rotation_reflection & DRM_MODE_ROTATE_MASK); sum_rot = (panel_rot + cmdline_rot) % 4; panel_rest = *rotation & ~DRM_MODE_ROTATE_MASK; cmdline_rest = cmdline->rotation_reflection & ~DRM_MODE_ROTATE_MASK; sum_rest = panel_rest ^ cmdline_rest; *rotation = (1 << sum_rot) | sum_rest; } /* * TODO: support 90 / 270 degree hardware rotation, * depending on the hardware this may require the framebuffer * to be in a specific tiling format. */ if (((*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_0 && (*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_180) || !plane->rotation_property) return false; for (i = 0; i < plane->rotation_property->num_values; i++) valid_mask |= (1ULL << plane->rotation_property->values[i]); if (!(*rotation & valid_mask)) return false; return true; } EXPORT_SYMBOL(drm_client_rotation); static int drm_client_modeset_commit_atomic(struct drm_client_dev *client, bool active, bool check) { struct drm_device *dev = client->dev; struct drm_plane *plane; struct drm_atomic_state *state; struct drm_modeset_acquire_ctx ctx; struct drm_mode_set *mode_set; int ret; drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_ctx; } state->acquire_ctx = &ctx; retry: drm_for_each_plane(plane, dev) { struct drm_plane_state *plane_state; plane_state = drm_atomic_get_plane_state(state, plane); if (IS_ERR(plane_state)) { ret = PTR_ERR(plane_state); goto out_state; } plane_state->rotation = DRM_MODE_ROTATE_0; /* disable non-primary: */ if (plane->type == DRM_PLANE_TYPE_PRIMARY) continue; ret = __drm_atomic_helper_disable_plane(plane, plane_state); if (ret != 0) goto out_state; } drm_client_for_each_modeset(mode_set, client) { struct drm_plane *primary = mode_set->crtc->primary; unsigned int rotation; if (drm_client_rotation(mode_set, &rotation)) { struct drm_plane_state *plane_state; /* Cannot fail as we've already gotten the plane state above */ plane_state = drm_atomic_get_new_plane_state(state, primary); plane_state->rotation = rotation; } ret = __drm_atomic_helper_set_config(mode_set, state); if (ret != 0) goto out_state; /* * __drm_atomic_helper_set_config() sets active when a * mode is set, unconditionally clear it if we force DPMS off */ if (!active) { struct drm_crtc *crtc = mode_set->crtc; struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc); crtc_state->active = false; } } if (check) ret = drm_atomic_check_only(state); else ret = drm_atomic_commit(state); out_state: if (ret == -EDEADLK) goto backoff; drm_atomic_state_put(state); out_ctx: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; backoff: drm_atomic_state_clear(state); drm_modeset_backoff(&ctx); goto retry; } static int drm_client_modeset_commit_legacy(struct drm_client_dev *client) { struct drm_device *dev = client->dev; struct drm_mode_set *mode_set; struct drm_plane *plane; int ret = 0; drm_modeset_lock_all(dev); drm_for_each_plane(plane, dev) { if (plane->type != DRM_PLANE_TYPE_PRIMARY) drm_plane_force_disable(plane); if (plane->rotation_property) drm_mode_plane_set_obj_prop(plane, plane->rotation_property, DRM_MODE_ROTATE_0); } drm_client_for_each_modeset(mode_set, client) { struct drm_crtc *crtc = mode_set->crtc; if (crtc->funcs->cursor_set2) { ret = crtc->funcs->cursor_set2(crtc, NULL, 0, 0, 0, 0, 0); if (ret) goto out; } else if (crtc->funcs->cursor_set) { ret = crtc->funcs->cursor_set(crtc, NULL, 0, 0, 0); if (ret) goto out; } ret = drm_mode_set_config_internal(mode_set); if (ret) goto out; } out: drm_modeset_unlock_all(dev); return ret; } /** * drm_client_modeset_check() - Check modeset configuration * @client: DRM client * * Check modeset configuration. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_check(struct drm_client_dev *client) { int ret; if (!drm_drv_uses_atomic_modeset(client->dev)) return 0; mutex_lock(&client->modeset_mutex); ret = drm_client_modeset_commit_atomic(client, true, true); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_check); /** * drm_client_modeset_commit_locked() - Force commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs without checking if there is a DRM * master. The assumption is that the caller already holds an internal DRM * master reference acquired with drm_master_internal_acquire(). * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit_locked(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, true, false); else ret = drm_client_modeset_commit_legacy(client); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit_locked); /** * drm_client_modeset_commit() - Commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; if (!drm_master_internal_acquire(dev)) return -EBUSY; ret = drm_client_modeset_commit_locked(client); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit); static void drm_client_modeset_dpms_legacy(struct drm_client_dev *client, int dpms_mode) { struct drm_device *dev = client->dev; struct drm_connector *connector; struct drm_mode_set *modeset; struct drm_modeset_acquire_ctx ctx; int ret; DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, 0, ret); drm_client_for_each_modeset(modeset, client) { int j; if (!modeset->crtc->enabled) continue; for (j = 0; j < modeset->num_connectors; j++) { connector = modeset->connectors[j]; connector->funcs->dpms(connector, dpms_mode); drm_object_property_set_value(&connector->base, dev->mode_config.dpms_property, dpms_mode); } } DRM_MODESET_LOCK_ALL_END(dev, ctx, ret); } /** * drm_client_modeset_dpms() - Set DPMS mode * @client: DRM client * @mode: DPMS mode * * Note: For atomic drivers @mode is reduced to on/off. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_dpms(struct drm_client_dev *client, int mode) { struct drm_device *dev = client->dev; int ret = 0; if (!drm_master_internal_acquire(dev)) return -EBUSY; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, mode == DRM_MODE_DPMS_ON, false); else drm_client_modeset_dpms_legacy(client, mode); mutex_unlock(&client->modeset_mutex); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_dpms); #ifdef CONFIG_DRM_KUNIT_TEST #include "tests/drm_client_modeset_test.c" #endif |
| 2 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 | // SPDX-License-Identifier: GPL-2.0-or-later /* A driver for the D-Link DSB-R100 USB radio and Gemtek USB Radio 21. * The device plugs into both the USB and an analog audio input, so this thing * only deals with initialisation and frequency setting, the * audio data has to be handled by a sound driver. * * Major issue: I can't find out where the device reports the signal * strength, and indeed the windows software appearantly just looks * at the stereo indicator as well. So, scanning will only find * stereo stations. Sad, but I can't help it. * * Also, the windows program sends oodles of messages over to the * device, and I couldn't figure out their meaning. My suspicion * is that they don't have any:-) * * You might find some interesting stuff about this module at * http://unimut.fsk.uni-heidelberg.de/unimut/demi/dsbr * * Fully tested with the Keene USB FM Transmitter and the v4l2-compliance tool. * * Copyright (c) 2000 Markus Demleitner <msdemlei@cl.uni-heidelberg.de> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/input.h> #include <linux/videodev2.h> #include <linux/usb.h> #include <media/v4l2-device.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> /* * Version Information */ MODULE_AUTHOR("Markus Demleitner <msdemlei@tucana.harvard.edu>"); MODULE_DESCRIPTION("D-Link DSB-R100 USB FM radio driver"); MODULE_LICENSE("GPL"); MODULE_VERSION("1.1.0"); #define DSB100_VENDOR 0x04b4 #define DSB100_PRODUCT 0x1002 /* Commands the device appears to understand */ #define DSB100_TUNE 1 #define DSB100_ONOFF 2 #define TB_LEN 16 /* Frequency limits in MHz -- these are European values. For Japanese devices, that would be 76 and 91. */ #define FREQ_MIN 87.5 #define FREQ_MAX 108.0 #define FREQ_MUL 16000 #define v4l2_dev_to_radio(d) container_of(d, struct dsbr100_device, v4l2_dev) static int radio_nr = -1; module_param(radio_nr, int, 0); /* Data for one (physical) device */ struct dsbr100_device { struct usb_device *usbdev; struct video_device videodev; struct v4l2_device v4l2_dev; struct v4l2_ctrl_handler hdl; u8 *transfer_buffer; struct mutex v4l2_lock; int curfreq; bool stereo; bool muted; }; /* Low-level device interface begins here */ /* set a frequency, freq is defined by v4l's TUNER_LOW, i.e. 1/16th kHz */ static int dsbr100_setfreq(struct dsbr100_device *radio, unsigned freq) { unsigned f = (freq / 16 * 80) / 1000 + 856; int retval = 0; if (!radio->muted) { retval = usb_control_msg(radio->usbdev, usb_rcvctrlpipe(radio->usbdev, 0), DSB100_TUNE, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, (f >> 8) & 0x00ff, f & 0xff, radio->transfer_buffer, 8, 300); if (retval >= 0) mdelay(1); } if (retval >= 0) { radio->curfreq = freq; return 0; } dev_err(&radio->usbdev->dev, "%s - usb_control_msg returned %i, request %i\n", __func__, retval, DSB100_TUNE); return retval; } /* switch on radio */ static int dsbr100_start(struct dsbr100_device *radio) { int retval = usb_control_msg(radio->usbdev, usb_rcvctrlpipe(radio->usbdev, 0), DSB100_ONOFF, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x01, 0x00, radio->transfer_buffer, 8, 300); if (retval >= 0) return dsbr100_setfreq(radio, radio->curfreq); dev_err(&radio->usbdev->dev, "%s - usb_control_msg returned %i, request %i\n", __func__, retval, DSB100_ONOFF); return retval; } /* switch off radio */ static int dsbr100_stop(struct dsbr100_device *radio) { int retval = usb_control_msg(radio->usbdev, usb_rcvctrlpipe(radio->usbdev, 0), DSB100_ONOFF, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x00, 0x00, radio->transfer_buffer, 8, 300); if (retval >= 0) return 0; dev_err(&radio->usbdev->dev, "%s - usb_control_msg returned %i, request %i\n", __func__, retval, DSB100_ONOFF); return retval; } /* return the device status. This is, in effect, just whether it sees a stereo signal or not. Pity. */ static void dsbr100_getstat(struct dsbr100_device *radio) { int retval = usb_control_msg(radio->usbdev, usb_rcvctrlpipe(radio->usbdev, 0), USB_REQ_GET_STATUS, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x00, 0x24, radio->transfer_buffer, 8, 300); if (retval < 0) { radio->stereo = false; dev_err(&radio->usbdev->dev, "%s - usb_control_msg returned %i, request %i\n", __func__, retval, USB_REQ_GET_STATUS); } else { radio->stereo = !(radio->transfer_buffer[0] & 0x01); } } static int vidioc_querycap(struct file *file, void *priv, struct v4l2_capability *v) { struct dsbr100_device *radio = video_drvdata(file); strscpy(v->driver, "dsbr100", sizeof(v->driver)); strscpy(v->card, "D-Link R-100 USB FM Radio", sizeof(v->card)); usb_make_path(radio->usbdev, v->bus_info, sizeof(v->bus_info)); return 0; } static int vidioc_g_tuner(struct file *file, void *priv, struct v4l2_tuner *v) { struct dsbr100_device *radio = video_drvdata(file); if (v->index > 0) return -EINVAL; dsbr100_getstat(radio); strscpy(v->name, "FM", sizeof(v->name)); v->type = V4L2_TUNER_RADIO; v->rangelow = FREQ_MIN * FREQ_MUL; v->rangehigh = FREQ_MAX * FREQ_MUL; v->rxsubchans = radio->stereo ? V4L2_TUNER_SUB_STEREO : V4L2_TUNER_SUB_MONO; v->capability = V4L2_TUNER_CAP_LOW | V4L2_TUNER_CAP_STEREO; v->audmode = V4L2_TUNER_MODE_STEREO; v->signal = radio->stereo ? 0xffff : 0; /* We can't get the signal strength */ return 0; } static int vidioc_s_tuner(struct file *file, void *priv, const struct v4l2_tuner *v) { return v->index ? -EINVAL : 0; } static int vidioc_s_frequency(struct file *file, void *priv, const struct v4l2_frequency *f) { struct dsbr100_device *radio = video_drvdata(file); if (f->tuner != 0 || f->type != V4L2_TUNER_RADIO) return -EINVAL; return dsbr100_setfreq(radio, clamp_t(unsigned, f->frequency, FREQ_MIN * FREQ_MUL, FREQ_MAX * FREQ_MUL)); } static int vidioc_g_frequency(struct file *file, void *priv, struct v4l2_frequency *f) { struct dsbr100_device *radio = video_drvdata(file); if (f->tuner) return -EINVAL; f->type = V4L2_TUNER_RADIO; f->frequency = radio->curfreq; return 0; } static int usb_dsbr100_s_ctrl(struct v4l2_ctrl *ctrl) { struct dsbr100_device *radio = container_of(ctrl->handler, struct dsbr100_device, hdl); switch (ctrl->id) { case V4L2_CID_AUDIO_MUTE: radio->muted = ctrl->val; return radio->muted ? dsbr100_stop(radio) : dsbr100_start(radio); } return -EINVAL; } /* USB subsystem interface begins here */ /* * Handle unplugging of the device. * We call video_unregister_device in any case. * The last function called in this procedure is * usb_dsbr100_video_device_release */ static void usb_dsbr100_disconnect(struct usb_interface *intf) { struct dsbr100_device *radio = usb_get_intfdata(intf); mutex_lock(&radio->v4l2_lock); /* * Disconnect is also called on unload, and in that case we need to * mute the device. This call will silently fail if it is called * after a physical disconnect. */ usb_control_msg(radio->usbdev, usb_rcvctrlpipe(radio->usbdev, 0), DSB100_ONOFF, USB_TYPE_VENDOR | USB_RECIP_DEVICE | USB_DIR_IN, 0x00, 0x00, radio->transfer_buffer, 8, 300); usb_set_intfdata(intf, NULL); video_unregister_device(&radio->videodev); v4l2_device_disconnect(&radio->v4l2_dev); mutex_unlock(&radio->v4l2_lock); v4l2_device_put(&radio->v4l2_dev); } /* Suspend device - stop device. */ static int usb_dsbr100_suspend(struct usb_interface *intf, pm_message_t message) { struct dsbr100_device *radio = usb_get_intfdata(intf); mutex_lock(&radio->v4l2_lock); if (!radio->muted && dsbr100_stop(radio) < 0) dev_warn(&intf->dev, "dsbr100_stop failed\n"); mutex_unlock(&radio->v4l2_lock); dev_info(&intf->dev, "going into suspend..\n"); return 0; } /* Resume device - start device. */ static int usb_dsbr100_resume(struct usb_interface *intf) { struct dsbr100_device *radio = usb_get_intfdata(intf); mutex_lock(&radio->v4l2_lock); if (!radio->muted && dsbr100_start(radio) < 0) dev_warn(&intf->dev, "dsbr100_start failed\n"); mutex_unlock(&radio->v4l2_lock); dev_info(&intf->dev, "coming out of suspend..\n"); return 0; } /* free data structures */ static void usb_dsbr100_release(struct v4l2_device *v4l2_dev) { struct dsbr100_device *radio = v4l2_dev_to_radio(v4l2_dev); v4l2_ctrl_handler_free(&radio->hdl); v4l2_device_unregister(&radio->v4l2_dev); kfree(radio->transfer_buffer); kfree(radio); } static const struct v4l2_ctrl_ops usb_dsbr100_ctrl_ops = { .s_ctrl = usb_dsbr100_s_ctrl, }; /* File system interface */ static const struct v4l2_file_operations usb_dsbr100_fops = { .owner = THIS_MODULE, .unlocked_ioctl = video_ioctl2, .open = v4l2_fh_open, .release = v4l2_fh_release, .poll = v4l2_ctrl_poll, }; static const struct v4l2_ioctl_ops usb_dsbr100_ioctl_ops = { .vidioc_querycap = vidioc_querycap, .vidioc_g_tuner = vidioc_g_tuner, .vidioc_s_tuner = vidioc_s_tuner, .vidioc_g_frequency = vidioc_g_frequency, .vidioc_s_frequency = vidioc_s_frequency, .vidioc_log_status = v4l2_ctrl_log_status, .vidioc_subscribe_event = v4l2_ctrl_subscribe_event, .vidioc_unsubscribe_event = v4l2_event_unsubscribe, }; /* check if the device is present and register with v4l and usb if it is */ static int usb_dsbr100_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct dsbr100_device *radio; struct v4l2_device *v4l2_dev; int retval; radio = kzalloc(sizeof(struct dsbr100_device), GFP_KERNEL); if (!radio) return -ENOMEM; radio->transfer_buffer = kmalloc(TB_LEN, GFP_KERNEL); if (!(radio->transfer_buffer)) { kfree(radio); return -ENOMEM; } v4l2_dev = &radio->v4l2_dev; v4l2_dev->release = usb_dsbr100_release; retval = v4l2_device_register(&intf->dev, v4l2_dev); if (retval < 0) { v4l2_err(v4l2_dev, "couldn't register v4l2_device\n"); goto err_reg_dev; } v4l2_ctrl_handler_init(&radio->hdl, 1); v4l2_ctrl_new_std(&radio->hdl, &usb_dsbr100_ctrl_ops, V4L2_CID_AUDIO_MUTE, 0, 1, 1, 1); if (radio->hdl.error) { retval = radio->hdl.error; v4l2_err(v4l2_dev, "couldn't register control\n"); goto err_reg_ctrl; } mutex_init(&radio->v4l2_lock); strscpy(radio->videodev.name, v4l2_dev->name, sizeof(radio->videodev.name)); radio->videodev.v4l2_dev = v4l2_dev; radio->videodev.fops = &usb_dsbr100_fops; radio->videodev.ioctl_ops = &usb_dsbr100_ioctl_ops; radio->videodev.release = video_device_release_empty; radio->videodev.lock = &radio->v4l2_lock; radio->videodev.ctrl_handler = &radio->hdl; radio->videodev.device_caps = V4L2_CAP_RADIO | V4L2_CAP_TUNER; radio->usbdev = interface_to_usbdev(intf); radio->curfreq = FREQ_MIN * FREQ_MUL; radio->muted = true; video_set_drvdata(&radio->videodev, radio); usb_set_intfdata(intf, radio); retval = video_register_device(&radio->videodev, VFL_TYPE_RADIO, radio_nr); if (retval == 0) return 0; v4l2_err(v4l2_dev, "couldn't register video device\n"); err_reg_ctrl: v4l2_ctrl_handler_free(&radio->hdl); v4l2_device_unregister(v4l2_dev); err_reg_dev: kfree(radio->transfer_buffer); kfree(radio); return retval; } static const struct usb_device_id usb_dsbr100_device_table[] = { { USB_DEVICE(DSB100_VENDOR, DSB100_PRODUCT) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, usb_dsbr100_device_table); /* USB subsystem interface */ static struct usb_driver usb_dsbr100_driver = { .name = "dsbr100", .probe = usb_dsbr100_probe, .disconnect = usb_dsbr100_disconnect, .id_table = usb_dsbr100_device_table, .suspend = usb_dsbr100_suspend, .resume = usb_dsbr100_resume, .reset_resume = usb_dsbr100_resume, }; module_usb_driver(usb_dsbr100_driver); |
| 6 6 6 6 6 6 6 6 1 6 2 6 6 2 6 6 4 6 6 6 6 8 6 2 5 6 5 5 1 8 8 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 | // SPDX-License-Identifier: GPL-2.0-or-later /* * PRNG: Pseudo Random Number Generator * Based on NIST Recommended PRNG From ANSI X9.31 Appendix A.2.4 using * AES 128 cipher * * (C) Neil Horman <nhorman@tuxdriver.com> */ #include <crypto/internal/cipher.h> #include <crypto/internal/rng.h> #include <linux/err.h> #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/string.h> #define DEFAULT_PRNG_KEY "0123456789abcdef" #define DEFAULT_PRNG_KSZ 16 #define DEFAULT_BLK_SZ 16 #define DEFAULT_V_SEED "zaybxcwdveuftgsh" /* * Flags for the prng_context flags field */ #define PRNG_FIXED_SIZE 0x1 #define PRNG_NEED_RESET 0x2 /* * Note: DT is our counter value * I is our intermediate value * V is our seed vector * See http://csrc.nist.gov/groups/STM/cavp/documents/rng/931rngext.pdf * for implementation details */ struct prng_context { spinlock_t prng_lock; unsigned char rand_data[DEFAULT_BLK_SZ]; unsigned char last_rand_data[DEFAULT_BLK_SZ]; unsigned char DT[DEFAULT_BLK_SZ]; unsigned char I[DEFAULT_BLK_SZ]; unsigned char V[DEFAULT_BLK_SZ]; u32 rand_data_valid; struct crypto_cipher *tfm; u32 flags; }; static int dbg; static void hexdump(char *note, unsigned char *buf, unsigned int len) { if (dbg) { printk(KERN_CRIT "%s", note); print_hex_dump(KERN_CONT, "", DUMP_PREFIX_OFFSET, 16, 1, buf, len, false); } } #define dbgprint(format, args...) do {\ if (dbg)\ printk(format, ##args);\ } while (0) static void xor_vectors(unsigned char *in1, unsigned char *in2, unsigned char *out, unsigned int size) { int i; for (i = 0; i < size; i++) out[i] = in1[i] ^ in2[i]; } /* * Returns DEFAULT_BLK_SZ bytes of random data per call * returns 0 if generation succeeded, <0 if something went wrong */ static int _get_more_prng_bytes(struct prng_context *ctx, int cont_test) { int i; unsigned char tmp[DEFAULT_BLK_SZ]; unsigned char *output = NULL; dbgprint(KERN_CRIT "Calling _get_more_prng_bytes for context %p\n", ctx); hexdump("Input DT: ", ctx->DT, DEFAULT_BLK_SZ); hexdump("Input I: ", ctx->I, DEFAULT_BLK_SZ); hexdump("Input V: ", ctx->V, DEFAULT_BLK_SZ); /* * This algorithm is a 3 stage state machine */ for (i = 0; i < 3; i++) { switch (i) { case 0: /* * Start by encrypting the counter value * This gives us an intermediate value I */ memcpy(tmp, ctx->DT, DEFAULT_BLK_SZ); output = ctx->I; hexdump("tmp stage 0: ", tmp, DEFAULT_BLK_SZ); break; case 1: /* * Next xor I with our secret vector V * encrypt that result to obtain our * pseudo random data which we output */ xor_vectors(ctx->I, ctx->V, tmp, DEFAULT_BLK_SZ); hexdump("tmp stage 1: ", tmp, DEFAULT_BLK_SZ); output = ctx->rand_data; break; case 2: /* * First check that we didn't produce the same * random data that we did last time around through this */ if (!memcmp(ctx->rand_data, ctx->last_rand_data, DEFAULT_BLK_SZ)) { if (cont_test) { panic("cprng %p Failed repetition check!\n", ctx); } printk(KERN_ERR "ctx %p Failed repetition check!\n", ctx); ctx->flags |= PRNG_NEED_RESET; return -EINVAL; } memcpy(ctx->last_rand_data, ctx->rand_data, DEFAULT_BLK_SZ); /* * Lastly xor the random data with I * and encrypt that to obtain a new secret vector V */ xor_vectors(ctx->rand_data, ctx->I, tmp, DEFAULT_BLK_SZ); output = ctx->V; hexdump("tmp stage 2: ", tmp, DEFAULT_BLK_SZ); break; } /* do the encryption */ crypto_cipher_encrypt_one(ctx->tfm, output, tmp); } /* * Now update our DT value */ for (i = DEFAULT_BLK_SZ - 1; i >= 0; i--) { ctx->DT[i] += 1; if (ctx->DT[i] != 0) break; } dbgprint("Returning new block for context %p\n", ctx); ctx->rand_data_valid = 0; hexdump("Output DT: ", ctx->DT, DEFAULT_BLK_SZ); hexdump("Output I: ", ctx->I, DEFAULT_BLK_SZ); hexdump("Output V: ", ctx->V, DEFAULT_BLK_SZ); hexdump("New Random Data: ", ctx->rand_data, DEFAULT_BLK_SZ); return 0; } /* Our exported functions */ static int get_prng_bytes(char *buf, size_t nbytes, struct prng_context *ctx, int do_cont_test) { unsigned char *ptr = buf; unsigned int byte_count = (unsigned int)nbytes; int err; spin_lock_bh(&ctx->prng_lock); err = -EINVAL; if (ctx->flags & PRNG_NEED_RESET) goto done; /* * If the FIXED_SIZE flag is on, only return whole blocks of * pseudo random data */ err = -EINVAL; if (ctx->flags & PRNG_FIXED_SIZE) { if (nbytes < DEFAULT_BLK_SZ) goto done; byte_count = DEFAULT_BLK_SZ; } /* * Return 0 in case of success as mandated by the kernel * crypto API interface definition. */ err = 0; dbgprint(KERN_CRIT "getting %d random bytes for context %p\n", byte_count, ctx); remainder: if (ctx->rand_data_valid == DEFAULT_BLK_SZ) { if (_get_more_prng_bytes(ctx, do_cont_test) < 0) { memset(buf, 0, nbytes); err = -EINVAL; goto done; } } /* * Copy any data less than an entire block */ if (byte_count < DEFAULT_BLK_SZ) { empty_rbuf: while (ctx->rand_data_valid < DEFAULT_BLK_SZ) { *ptr = ctx->rand_data[ctx->rand_data_valid]; ptr++; byte_count--; ctx->rand_data_valid++; if (byte_count == 0) goto done; } } /* * Now copy whole blocks */ for (; byte_count >= DEFAULT_BLK_SZ; byte_count -= DEFAULT_BLK_SZ) { if (ctx->rand_data_valid == DEFAULT_BLK_SZ) { if (_get_more_prng_bytes(ctx, do_cont_test) < 0) { memset(buf, 0, nbytes); err = -EINVAL; goto done; } } if (ctx->rand_data_valid > 0) goto empty_rbuf; memcpy(ptr, ctx->rand_data, DEFAULT_BLK_SZ); ctx->rand_data_valid += DEFAULT_BLK_SZ; ptr += DEFAULT_BLK_SZ; } /* * Now go back and get any remaining partial block */ if (byte_count) goto remainder; done: spin_unlock_bh(&ctx->prng_lock); dbgprint(KERN_CRIT "returning %d from get_prng_bytes in context %p\n", err, ctx); return err; } static void free_prng_context(struct prng_context *ctx) { crypto_free_cipher(ctx->tfm); } static int reset_prng_context(struct prng_context *ctx, const unsigned char *key, size_t klen, const unsigned char *V, const unsigned char *DT) { int ret; const unsigned char *prng_key; spin_lock_bh(&ctx->prng_lock); ctx->flags |= PRNG_NEED_RESET; prng_key = (key != NULL) ? key : (unsigned char *)DEFAULT_PRNG_KEY; if (!key) klen = DEFAULT_PRNG_KSZ; if (V) memcpy(ctx->V, V, DEFAULT_BLK_SZ); else memcpy(ctx->V, DEFAULT_V_SEED, DEFAULT_BLK_SZ); if (DT) memcpy(ctx->DT, DT, DEFAULT_BLK_SZ); else memset(ctx->DT, 0, DEFAULT_BLK_SZ); memset(ctx->rand_data, 0, DEFAULT_BLK_SZ); memset(ctx->last_rand_data, 0, DEFAULT_BLK_SZ); ctx->rand_data_valid = DEFAULT_BLK_SZ; ret = crypto_cipher_setkey(ctx->tfm, prng_key, klen); if (ret) { dbgprint(KERN_CRIT "PRNG: setkey() failed flags=%x\n", crypto_cipher_get_flags(ctx->tfm)); goto out; } ret = 0; ctx->flags &= ~PRNG_NEED_RESET; out: spin_unlock_bh(&ctx->prng_lock); return ret; } static int cprng_init(struct crypto_tfm *tfm) { struct prng_context *ctx = crypto_tfm_ctx(tfm); spin_lock_init(&ctx->prng_lock); ctx->tfm = crypto_alloc_cipher("aes", 0, 0); if (IS_ERR(ctx->tfm)) { dbgprint(KERN_CRIT "Failed to alloc tfm for context %p\n", ctx); return PTR_ERR(ctx->tfm); } if (reset_prng_context(ctx, NULL, DEFAULT_PRNG_KSZ, NULL, NULL) < 0) return -EINVAL; /* * after allocation, we should always force the user to reset * so they don't inadvertently use the insecure default values * without specifying them intentially */ ctx->flags |= PRNG_NEED_RESET; return 0; } static void cprng_exit(struct crypto_tfm *tfm) { free_prng_context(crypto_tfm_ctx(tfm)); } static int cprng_get_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) { struct prng_context *prng = crypto_rng_ctx(tfm); return get_prng_bytes(rdata, dlen, prng, 0); } /* * This is the cprng_registered reset method the seed value is * interpreted as the tuple { V KEY DT} * V and KEY are required during reset, and DT is optional, detected * as being present by testing the length of the seed */ static int cprng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { struct prng_context *prng = crypto_rng_ctx(tfm); const u8 *key = seed + DEFAULT_BLK_SZ; const u8 *dt = NULL; if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ) return -EINVAL; if (slen >= (2 * DEFAULT_BLK_SZ + DEFAULT_PRNG_KSZ)) dt = key + DEFAULT_PRNG_KSZ; reset_prng_context(prng, key, DEFAULT_PRNG_KSZ, seed, dt); if (prng->flags & PRNG_NEED_RESET) return -EINVAL; return 0; } #ifdef CONFIG_CRYPTO_FIPS static int fips_cprng_get_random(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *rdata, unsigned int dlen) { struct prng_context *prng = crypto_rng_ctx(tfm); return get_prng_bytes(rdata, dlen, prng, 1); } static int fips_cprng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen) { u8 rdata[DEFAULT_BLK_SZ]; const u8 *key = seed + DEFAULT_BLK_SZ; int rc; struct prng_context *prng = crypto_rng_ctx(tfm); if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ) return -EINVAL; /* fips strictly requires seed != key */ if (!memcmp(seed, key, DEFAULT_PRNG_KSZ)) return -EINVAL; rc = cprng_reset(tfm, seed, slen); if (!rc) goto out; /* this primes our continuity test */ rc = get_prng_bytes(rdata, DEFAULT_BLK_SZ, prng, 0); prng->rand_data_valid = DEFAULT_BLK_SZ; out: return rc; } #endif static struct rng_alg rng_algs[] = { { .generate = cprng_get_random, .seed = cprng_reset, .seedsize = DEFAULT_PRNG_KSZ + 2 * DEFAULT_BLK_SZ, .base = { .cra_name = "stdrng", .cra_driver_name = "ansi_cprng", .cra_priority = 100, .cra_ctxsize = sizeof(struct prng_context), .cra_module = THIS_MODULE, .cra_init = cprng_init, .cra_exit = cprng_exit, } #ifdef CONFIG_CRYPTO_FIPS }, { .generate = fips_cprng_get_random, .seed = fips_cprng_reset, .seedsize = DEFAULT_PRNG_KSZ + 2 * DEFAULT_BLK_SZ, .base = { .cra_name = "fips(ansi_cprng)", .cra_driver_name = "fips_ansi_cprng", .cra_priority = 300, .cra_ctxsize = sizeof(struct prng_context), .cra_module = THIS_MODULE, .cra_init = cprng_init, .cra_exit = cprng_exit, } #endif } }; /* Module initalization */ static int __init prng_mod_init(void) { return crypto_register_rngs(rng_algs, ARRAY_SIZE(rng_algs)); } static void __exit prng_mod_fini(void) { crypto_unregister_rngs(rng_algs, ARRAY_SIZE(rng_algs)); } MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Software Pseudo Random Number Generator"); MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>"); module_param(dbg, int, 0); MODULE_PARM_DESC(dbg, "Boolean to enable debugging (0/1 == off/on)"); module_init(prng_mod_init); module_exit(prng_mod_fini); MODULE_ALIAS_CRYPTO("stdrng"); MODULE_ALIAS_CRYPTO("ansi_cprng"); MODULE_IMPORT_NS("CRYPTO_INTERNAL"); |
| 288 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Virtio PCI driver - common functionality for all device versions * * This module allows virtio devices to be used over a virtual PCI device. * This can be used with QEMU based VMMs like KVM or Xen. * * Copyright IBM Corp. 2007 * Copyright Red Hat, Inc. 2014 * * Authors: * Anthony Liguori <aliguori@us.ibm.com> * Rusty Russell <rusty@rustcorp.com.au> * Michael S. Tsirkin <mst@redhat.com> */ #include "virtio_pci_common.h" static bool force_legacy = false; #if IS_ENABLED(CONFIG_VIRTIO_PCI_LEGACY) module_param(force_legacy, bool, 0444); MODULE_PARM_DESC(force_legacy, "Force legacy mode for transitional virtio 1 devices"); #endif bool vp_is_avq(struct virtio_device *vdev, unsigned int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) return false; return index == vp_dev->admin_vq.vq_index; } /* wait for pending irq handlers */ void vp_synchronize_vectors(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i; if (vp_dev->intx_enabled) synchronize_irq(vp_dev->pci_dev->irq); for (i = 0; i < vp_dev->msix_vectors; ++i) synchronize_irq(pci_irq_vector(vp_dev->pci_dev, i)); } /* the notify function used when creating a virt queue */ bool vp_notify(struct virtqueue *vq) { /* we write the queue's selector into the notification register to * signal the other end */ iowrite16(vq->index, (void __iomem *)vq->priv); return true; } /* Notify all slow path virtqueues on an interrupt. */ static void vp_vring_slow_path_interrupt(int irq, struct virtio_pci_device *vp_dev) { struct virtio_pci_vq_info *info; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->slow_virtqueues, node) vring_interrupt(irq, info->vq); spin_unlock_irqrestore(&vp_dev->lock, flags); } /* Handle a configuration change: Tell driver if it wants to know. */ static irqreturn_t vp_config_changed(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; virtio_config_changed(&vp_dev->vdev); vp_vring_slow_path_interrupt(irq, vp_dev); return IRQ_HANDLED; } /* Notify all virtqueues on an interrupt. */ static irqreturn_t vp_vring_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; struct virtio_pci_vq_info *info; irqreturn_t ret = IRQ_NONE; unsigned long flags; spin_lock_irqsave(&vp_dev->lock, flags); list_for_each_entry(info, &vp_dev->virtqueues, node) { if (vring_interrupt(irq, info->vq) == IRQ_HANDLED) ret = IRQ_HANDLED; } spin_unlock_irqrestore(&vp_dev->lock, flags); return ret; } /* A small wrapper to also acknowledge the interrupt when it's handled. * I really need an EIO hook for the vring so I can ack the interrupt once we * know that we'll be handling the IRQ but before we invoke the callback since * the callback may notify the host which results in the host attempting to * raise an interrupt that we would then mask once we acknowledged the * interrupt. */ static irqreturn_t vp_interrupt(int irq, void *opaque) { struct virtio_pci_device *vp_dev = opaque; u8 isr; /* reading the ISR has the effect of also clearing it so it's very * important to save off the value. */ isr = ioread8(vp_dev->isr); /* It's definitely not us if the ISR was not high */ if (!isr) return IRQ_NONE; /* Configuration change? Tell driver if it wants to know. */ if (isr & VIRTIO_PCI_ISR_CONFIG) vp_config_changed(irq, opaque); return vp_vring_interrupt(irq, opaque); } static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, bool per_vq_vectors, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned int flags = PCI_IRQ_MSIX; unsigned int i, v; int err = -ENOMEM; vp_dev->msix_vectors = nvectors; vp_dev->msix_names = kmalloc_array(nvectors, sizeof(*vp_dev->msix_names), GFP_KERNEL); if (!vp_dev->msix_names) goto error; vp_dev->msix_affinity_masks = kcalloc(nvectors, sizeof(*vp_dev->msix_affinity_masks), GFP_KERNEL); if (!vp_dev->msix_affinity_masks) goto error; for (i = 0; i < nvectors; ++i) if (!alloc_cpumask_var(&vp_dev->msix_affinity_masks[i], GFP_KERNEL)) goto error; if (!per_vq_vectors) desc = NULL; if (desc) { flags |= PCI_IRQ_AFFINITY; desc->pre_vectors++; /* virtio config vector */ } err = pci_alloc_irq_vectors_affinity(vp_dev->pci_dev, nvectors, nvectors, flags, desc); if (err < 0) goto error; vp_dev->msix_enabled = 1; /* Set the vector used for configuration */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-config", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_config_changed, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; v = vp_dev->config_vector(vp_dev, v); /* Verify we had enough resources to assign the vector */ if (v == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto error; } if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, "%s-virtqueues", name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, v), vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) goto error; ++vp_dev->msix_used_vectors; } return 0; error: return err; } static bool vp_is_slow_path_vector(u16 msix_vec) { return msix_vec == VP_MSIX_CONFIG_VECTOR; } static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned int index, void (*callback)(struct virtqueue *vq), const char *name, bool ctx, u16 msix_vec, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = kmalloc(sizeof *info, GFP_KERNEL); struct virtqueue *vq; unsigned long flags; /* fill out our structure that represents an active queue */ if (!info) return ERR_PTR(-ENOMEM); vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; info->vq = vq; if (callback) { spin_lock_irqsave(&vp_dev->lock, flags); if (!vp_is_slow_path_vector(msix_vec)) list_add(&info->node, &vp_dev->virtqueues); else list_add(&info->node, &vp_dev->slow_virtqueues); spin_unlock_irqrestore(&vp_dev->lock, flags); } else { INIT_LIST_HEAD(&info->node); } *p_info = info; return vq; out_info: kfree(info); return vq; } static void vp_del_vq(struct virtqueue *vq, struct virtio_pci_vq_info *info) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); unsigned long flags; /* * If it fails during re-enable reset vq. This way we won't rejoin * info->node to the queue. Prevent unexpected irqs. */ if (!vq->reset) { spin_lock_irqsave(&vp_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vp_dev->lock, flags); } vp_dev->del_vq(info); kfree(info); } /* the config->del_vqs() implementation */ void vp_del_vqs(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq, *n; int i; list_for_each_entry_safe(vq, n, &vdev->vqs, list) { info = vp_is_avq(vdev, vq->index) ? vp_dev->admin_vq.info : vp_dev->vqs[vq->index]; if (vp_dev->per_vq_vectors) { int v = info->msix_vector; if (v != VIRTIO_MSI_NO_VECTOR && !vp_is_slow_path_vector(v)) { int irq = pci_irq_vector(vp_dev->pci_dev, v); irq_update_affinity_hint(irq, NULL); free_irq(irq, vq); } } vp_del_vq(vq, info); } vp_dev->per_vq_vectors = false; if (vp_dev->intx_enabled) { free_irq(vp_dev->pci_dev->irq, vp_dev); vp_dev->intx_enabled = 0; } for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(pci_irq_vector(vp_dev->pci_dev, i), vp_dev); if (vp_dev->msix_affinity_masks) { for (i = 0; i < vp_dev->msix_vectors; i++) free_cpumask_var(vp_dev->msix_affinity_masks[i]); } if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ vp_dev->config_vector(vp_dev, VIRTIO_MSI_NO_VECTOR); pci_free_irq_vectors(vp_dev->pci_dev); vp_dev->msix_enabled = 0; } vp_dev->msix_vectors = 0; vp_dev->msix_used_vectors = 0; kfree(vp_dev->msix_names); vp_dev->msix_names = NULL; kfree(vp_dev->msix_affinity_masks); vp_dev->msix_affinity_masks = NULL; kfree(vp_dev->vqs); vp_dev->vqs = NULL; } enum vp_vq_vector_policy { VP_VQ_VECTOR_POLICY_EACH, VP_VQ_VECTOR_POLICY_SHARED_SLOW, VP_VQ_VECTOR_POLICY_SHARED, }; static struct virtqueue * vp_find_one_vq_msix(struct virtio_device *vdev, int queue_idx, vq_callback_t *callback, const char *name, bool ctx, bool slow_path, int *allocated_vectors, enum vp_vq_vector_policy vector_policy, struct virtio_pci_vq_info **p_info) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; u16 msix_vec; int err; if (!callback) msix_vec = VIRTIO_MSI_NO_VECTOR; else if (vector_policy == VP_VQ_VECTOR_POLICY_EACH || (vector_policy == VP_VQ_VECTOR_POLICY_SHARED_SLOW && !slow_path)) msix_vec = (*allocated_vectors)++; else if (vector_policy != VP_VQ_VECTOR_POLICY_EACH && slow_path) msix_vec = VP_MSIX_CONFIG_VECTOR; else msix_vec = VP_MSIX_VQ_VECTOR; vq = vp_setup_vq(vdev, queue_idx, callback, name, ctx, msix_vec, p_info); if (IS_ERR(vq)) return vq; if (vector_policy == VP_VQ_VECTOR_POLICY_SHARED || msix_vec == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(msix_vec)) return vq; /* allocate per-vq irq if available and necessary */ snprintf(vp_dev->msix_names[msix_vec], sizeof(*vp_dev->msix_names), "%s-%s", dev_name(&vp_dev->vdev.dev), name); err = request_irq(pci_irq_vector(vp_dev->pci_dev, msix_vec), vring_interrupt, 0, vp_dev->msix_names[msix_vec], vq); if (err) { vp_del_vq(vq, *p_info); return ERR_PTR(err); } return vq; } static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], enum vp_vq_vector_policy vector_policy, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; struct virtqueue_info *vqi; int i, err, nvectors, allocated_vectors, queue_idx = 0; struct virtqueue *vq; bool per_vq_vectors; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto error_find; } per_vq_vectors = vector_policy != VP_VQ_VECTOR_POLICY_SHARED; if (per_vq_vectors) { /* Best option: one for change interrupt, one per vq. */ nvectors = 1; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (vqi->name && vqi->callback) ++nvectors; } if (avq_num && vector_policy == VP_VQ_VECTOR_POLICY_EACH) ++nvectors; } else { /* Second best: one for change, shared for all vqs. */ nvectors = 2; } err = vp_request_msix_vectors(vdev, nvectors, per_vq_vectors, desc); if (err) goto error_find; vp_dev->per_vq_vectors = per_vq_vectors; allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_find_one_vq_msix(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, false, &allocated_vectors, vector_policy, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto error_find; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_find_one_vq_msix(vdev, avq->vq_index, vp_modern_avq_done, avq->name, false, true, &allocated_vectors, vector_policy, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto error_find; } return 0; error_find: vp_del_vqs(vdev); return err; } static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[]) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_admin_vq *avq = &vp_dev->admin_vq; int i, err, queue_idx = 0; struct virtqueue *vq; u16 avq_num = 0; vp_dev->vqs = kcalloc(nvqs, sizeof(*vp_dev->vqs), GFP_KERNEL); if (!vp_dev->vqs) return -ENOMEM; if (vp_dev->avq_index) { err = vp_dev->avq_index(vdev, &avq->vq_index, &avq_num); if (err) goto out_del_vqs; } err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vp_dev); if (err) goto out_del_vqs; vp_dev->intx_enabled = 1; vp_dev->per_vq_vectors = false; for (i = 0; i < nvqs; ++i) { struct virtqueue_info *vqi = &vqs_info[i]; if (!vqi->name) { vqs[i] = NULL; continue; } vqs[i] = vp_setup_vq(vdev, queue_idx++, vqi->callback, vqi->name, vqi->ctx, VIRTIO_MSI_NO_VECTOR, &vp_dev->vqs[i]); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); goto out_del_vqs; } } if (!avq_num) return 0; sprintf(avq->name, "avq.%u", avq->vq_index); vq = vp_setup_vq(vdev, queue_idx++, vp_modern_avq_done, avq->name, false, VIRTIO_MSI_NO_VECTOR, &vp_dev->admin_vq.info); if (IS_ERR(vq)) { err = PTR_ERR(vq); goto out_del_vqs; } return 0; out_del_vqs: vp_del_vqs(vdev); return err; } /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned int nvqs, struct virtqueue *vqs[], struct virtqueue_info vqs_info[], struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_EACH, desc); if (!err) return 0; /* Fallback: MSI-X with one shared vector for config and * slow path queues, one vector per queue for the rest. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED_SLOW, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ err = vp_find_vqs_msix(vdev, nvqs, vqs, vqs_info, VP_VQ_VECTOR_POLICY_SHARED, desc); if (!err) return 0; /* Is there an interrupt? If not give up. */ if (!(to_vp_device(vdev)->pci_dev->irq)) return err; /* Finally fall back to regular interrupts. */ return vp_find_vqs_intx(vdev, nvqs, vqs, vqs_info); } const char *vp_bus_name(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); return pci_name(vp_dev->pci_dev); } /* Setup the affinity for a virtqueue: * - force the affinity for per vq vector * - OR over all affinities for shared MSI * - ignore the affinity request if we're using INTX */ int vp_set_vq_affinity(struct virtqueue *vq, const struct cpumask *cpu_mask) { struct virtio_device *vdev = vq->vdev; struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info = vp_dev->vqs[vq->index]; struct cpumask *mask; unsigned int irq; if (!vq->callback) return -EINVAL; if (vp_dev->msix_enabled) { mask = vp_dev->msix_affinity_masks[info->msix_vector]; irq = pci_irq_vector(vp_dev->pci_dev, info->msix_vector); if (!cpu_mask) irq_update_affinity_hint(irq, NULL); else { cpumask_copy(mask, cpu_mask); irq_set_affinity_and_hint(irq, mask); } } return 0; } const struct cpumask *vp_get_vq_affinity(struct virtio_device *vdev, int index) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); if (!vp_dev->per_vq_vectors || vp_dev->vqs[index]->msix_vector == VIRTIO_MSI_NO_VECTOR || vp_is_slow_path_vector(vp_dev->vqs[index]->msix_vector)) return NULL; return pci_irq_get_affinity(vp_dev->pci_dev, vp_dev->vqs[index]->msix_vector); } #ifdef CONFIG_PM_SLEEP static int virtio_pci_freeze(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = virtio_device_freeze(&vp_dev->vdev); if (!ret) pci_disable_device(pci_dev); return ret; } static int virtio_pci_restore(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; ret = pci_enable_device(pci_dev); if (ret) return ret; pci_set_master(pci_dev); return virtio_device_restore(&vp_dev->vdev); } static bool vp_supports_pm_no_reset(struct device *dev) { struct pci_dev *pci_dev = to_pci_dev(dev); u16 pmcsr; if (!pci_dev->pm_cap) return false; pci_read_config_word(pci_dev, pci_dev->pm_cap + PCI_PM_CTRL, &pmcsr); if (PCI_POSSIBLE_ERROR(pmcsr)) { dev_err(dev, "Unable to query pmcsr"); return false; } return pmcsr & PCI_PM_CTRL_NO_SOFT_RESET; } static int virtio_pci_suspend(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_freeze(dev); } static int virtio_pci_resume(struct device *dev) { return vp_supports_pm_no_reset(dev) ? 0 : virtio_pci_restore(dev); } static const struct dev_pm_ops virtio_pci_pm_ops = { .suspend = virtio_pci_suspend, .resume = virtio_pci_resume, .freeze = virtio_pci_freeze, .thaw = virtio_pci_restore, .poweroff = virtio_pci_freeze, .restore = virtio_pci_restore, }; #endif /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */ static const struct pci_device_id virtio_pci_id_table[] = { { PCI_DEVICE(PCI_VENDOR_ID_REDHAT_QUMRANET, PCI_ANY_ID) }, { 0 } }; MODULE_DEVICE_TABLE(pci, virtio_pci_id_table); static void virtio_pci_release_dev(struct device *_d) { struct virtio_device *vdev = dev_to_virtio(_d); struct virtio_pci_device *vp_dev = to_vp_device(vdev); /* As struct device is a kobject, it's not safe to * free the memory (including the reference counter itself) * until it's release callback. */ kfree(vp_dev); } static int virtio_pci_probe(struct pci_dev *pci_dev, const struct pci_device_id *id) { struct virtio_pci_device *vp_dev, *reg_dev = NULL; int rc; /* allocate our structure and fill it out */ vp_dev = kzalloc(sizeof(struct virtio_pci_device), GFP_KERNEL); if (!vp_dev) return -ENOMEM; pci_set_drvdata(pci_dev, vp_dev); vp_dev->vdev.dev.parent = &pci_dev->dev; vp_dev->vdev.dev.release = virtio_pci_release_dev; vp_dev->pci_dev = pci_dev; INIT_LIST_HEAD(&vp_dev->virtqueues); INIT_LIST_HEAD(&vp_dev->slow_virtqueues); spin_lock_init(&vp_dev->lock); /* enable the device */ rc = pci_enable_device(pci_dev); if (rc) goto err_enable_device; if (force_legacy) { rc = virtio_pci_legacy_probe(vp_dev); /* Also try modern mode if we can't map BAR0 (no IO space). */ if (rc == -ENODEV || rc == -ENOMEM) rc = virtio_pci_modern_probe(vp_dev); if (rc) goto err_probe; } else { rc = virtio_pci_modern_probe(vp_dev); if (rc == -ENODEV) rc = virtio_pci_legacy_probe(vp_dev); if (rc) goto err_probe; } pci_set_master(pci_dev); rc = register_virtio_device(&vp_dev->vdev); reg_dev = vp_dev; if (rc) goto err_register; return 0; err_register: if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); err_probe: pci_disable_device(pci_dev); err_enable_device: if (reg_dev) put_device(&vp_dev->vdev.dev); else kfree(vp_dev); return rc; } static void virtio_pci_remove(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev); /* * Device is marked broken on surprise removal so that virtio upper * layers can abort any ongoing operation. */ if (!pci_device_is_present(pci_dev)) virtio_break_device(&vp_dev->vdev); pci_disable_sriov(pci_dev); unregister_virtio_device(&vp_dev->vdev); if (vp_dev->is_legacy) virtio_pci_legacy_remove(vp_dev); else virtio_pci_modern_remove(vp_dev); pci_disable_device(pci_dev); put_device(dev); } static int virtio_pci_sriov_configure(struct pci_dev *pci_dev, int num_vfs) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct virtio_device *vdev = &vp_dev->vdev; int ret; if (!(vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK)) return -EBUSY; if (!__virtio_test_bit(vdev, VIRTIO_F_SR_IOV)) return -EINVAL; if (pci_vfs_assigned(pci_dev)) return -EPERM; if (num_vfs == 0) { pci_disable_sriov(pci_dev); return 0; } ret = pci_enable_sriov(pci_dev, num_vfs); if (ret < 0) return ret; return num_vfs; } static void virtio_pci_reset_prepare(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret = 0; ret = virtio_device_reset_prepare(&vp_dev->vdev); if (ret) { if (ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset prepare failure: %d", ret); return; } if (pci_is_enabled(pci_dev)) pci_disable_device(pci_dev); } static void virtio_pci_reset_done(struct pci_dev *pci_dev) { struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); int ret; if (pci_is_enabled(pci_dev)) return; ret = pci_enable_device(pci_dev); if (!ret) { pci_set_master(pci_dev); ret = virtio_device_reset_done(&vp_dev->vdev); } if (ret && ret != -EOPNOTSUPP) dev_warn(&pci_dev->dev, "Reset done failure: %d", ret); } static const struct pci_error_handlers virtio_pci_err_handler = { .reset_prepare = virtio_pci_reset_prepare, .reset_done = virtio_pci_reset_done, }; static struct pci_driver virtio_pci_driver = { .name = "virtio-pci", .id_table = virtio_pci_id_table, .probe = virtio_pci_probe, .remove = virtio_pci_remove, #ifdef CONFIG_PM_SLEEP .driver.pm = &virtio_pci_pm_ops, #endif .sriov_configure = virtio_pci_sriov_configure, .err_handler = &virtio_pci_err_handler, }; struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev) { struct virtio_pci_device *pf_vp_dev; pf_vp_dev = pci_iov_get_pf_drvdata(pdev, &virtio_pci_driver); if (IS_ERR(pf_vp_dev)) return NULL; return &pf_vp_dev->vdev; } module_pci_driver(virtio_pci_driver); MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>"); MODULE_DESCRIPTION("virtio-pci"); MODULE_LICENSE("GPL"); MODULE_VERSION("1"); |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 | // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/icmpv6.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/spinlock.h> #include <net/ipv6.h> #if IS_ENABLED(CONFIG_IPV6) #if !IS_BUILTIN(CONFIG_IPV6) static ip6_icmp_send_t __rcu *ip6_icmp_send; int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { return (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, NULL, fn) == NULL) ? 0 : -EBUSY; } EXPORT_SYMBOL(inet6_register_icmp_sender); int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { int ret; ret = (cmpxchg((ip6_icmp_send_t **)&ip6_icmp_send, fn, NULL) == fn) ? 0 : -EINVAL; synchronize_net(); return ret; } EXPORT_SYMBOL(inet6_unregister_icmp_sender); void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { ip6_icmp_send_t *send; rcu_read_lock(); send = rcu_dereference(ip6_icmp_send); if (send) send(skb, type, code, info, NULL, parm); rcu_read_unlock(); } EXPORT_SYMBOL(__icmpv6_send); #endif #if IS_ENABLED(CONFIG_NF_NAT) #include <net/netfilter/nf_conntrack.h> void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } if (skb_shared(skb_in)) skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC); if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head || (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) > skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in, skb_network_offset(skb_in) + sizeof(struct ipv6hdr)))) goto out; orig_ip = ipv6_hdr(skb_in)->saddr; dir = CTINFO2DIR(ctinfo); ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); } EXPORT_SYMBOL(icmpv6_ndo_send); #endif #endif |
| 4 4 4 4 4 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 | // SPDX-License-Identifier: GPL-2.0 /* * cfg80211 wext compat for managed mode. * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2009, 2020-2023 Intel Corporation */ #include <linux/export.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <net/cfg80211.h> #include <net/cfg80211-wext.h> #include "wext-compat.h" #include "nl80211.h" int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct cfg80211_cached_keys *ck = NULL; const u8 *prev_bssid = NULL; int err, i; ASSERT_RTNL(); lockdep_assert_wiphy(wdev->wiphy); if (!netif_running(wdev->netdev)) return 0; wdev->wext.connect.ie = wdev->wext.ie; wdev->wext.connect.ie_len = wdev->wext.ie_len; /* Use default background scan period */ wdev->wext.connect.bg_scan_period = -1; if (wdev->wext.keys) { wdev->wext.keys->def = wdev->wext.default_key; if (wdev->wext.default_key != -1) wdev->wext.connect.privacy = true; } if (!wdev->wext.connect.ssid_len) return 0; if (wdev->wext.keys && wdev->wext.keys->def != -1) { ck = kmemdup(wdev->wext.keys, sizeof(*ck), GFP_KERNEL); if (!ck) return -ENOMEM; for (i = 0; i < 4; i++) ck->params[i].key = ck->data[i]; } if (wdev->wext.prev_bssid_valid) prev_bssid = wdev->wext.prev_bssid; err = cfg80211_connect(rdev, wdev->netdev, &wdev->wext.connect, ck, prev_bssid); if (err) kfree_sensitive(ck); return err; } int cfg80211_mgd_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *wextfreq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct ieee80211_channel *chan = NULL; int err, freq; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; freq = cfg80211_wext_freq(wextfreq); if (freq < 0) return freq; if (freq) { chan = ieee80211_get_channel(wdev->wiphy, freq); if (!chan) return -EINVAL; if (chan->flags & IEEE80211_CHAN_DISABLED) return -EINVAL; } if (wdev->conn) { bool event = true; if (wdev->wext.connect.channel == chan) return 0; /* if SSID set, we'll try right again, avoid event */ if (wdev->wext.connect.ssid_len) event = false; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, event); if (err) return err; } wdev->wext.connect.channel = chan; return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct ieee80211_channel *chan = NULL; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (wdev->valid_links) return -EOPNOTSUPP; if (wdev->links[0].client.current_bss) chan = wdev->links[0].client.current_bss->pub.channel; else if (wdev->wext.connect.channel) chan = wdev->wext.connect.channel; if (chan) { freq->m = chan->center_freq; freq->e = 6; return 0; } /* no channel if not joining */ return -EINVAL; } int cfg80211_mgd_wext_siwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); size_t len = data->length; int err; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (!data->flags) len = 0; /* iwconfig uses nul termination in SSID.. */ if (len > 0 && ssid[len - 1] == '\0') len--; if (wdev->conn) { bool event = true; if (wdev->wext.connect.ssid && len && len == wdev->wext.connect.ssid_len && memcmp(wdev->wext.connect.ssid, ssid, len) == 0) return 0; /* if SSID set now, we'll try to connect, avoid event */ if (len) event = false; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, event); if (err) return err; } wdev->wext.prev_bssid_valid = false; wdev->wext.connect.ssid = wdev->wext.ssid; memcpy(wdev->wext.ssid, ssid, len); wdev->wext.connect.ssid_len = len; wdev->wext.connect.crypto.control_port = false; wdev->wext.connect.crypto.control_port_ethertype = cpu_to_be16(ETH_P_PAE); return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwessid(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *ssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; int ret = 0; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (wdev->valid_links) return -EINVAL; data->flags = 0; if (wdev->links[0].client.current_bss) { const struct element *ssid_elem; rcu_read_lock(); ssid_elem = ieee80211_bss_get_elem( &wdev->links[0].client.current_bss->pub, WLAN_EID_SSID); if (ssid_elem) { data->flags = 1; data->length = ssid_elem->datalen; if (data->length > IW_ESSID_MAX_SIZE) ret = -EINVAL; else memcpy(ssid, ssid_elem->data, data->length); } rcu_read_unlock(); } else if (wdev->wext.connect.ssid && wdev->wext.connect.ssid_len) { data->flags = 1; data->length = wdev->wext.connect.ssid_len; memcpy(ssid, wdev->wext.connect.ssid, data->length); } return ret; } int cfg80211_mgd_wext_siwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *bssid = ap_addr->sa_data; int err; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; if (ap_addr->sa_family != ARPHRD_ETHER) return -EINVAL; /* automatic mode */ if (is_zero_ether_addr(bssid) || is_broadcast_ether_addr(bssid)) bssid = NULL; if (wdev->conn) { /* both automatic */ if (!bssid && !wdev->wext.connect.bssid) return 0; /* fixed already - and no change */ if (wdev->wext.connect.bssid && bssid && ether_addr_equal(bssid, wdev->wext.connect.bssid)) return 0; err = cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, false); if (err) return err; } if (bssid) { memcpy(wdev->wext.bssid, bssid, ETH_ALEN); wdev->wext.connect.bssid = wdev->wext.bssid; } else wdev->wext.connect.bssid = NULL; return cfg80211_mgd_wext_connect(rdev, wdev); } int cfg80211_mgd_wext_giwap(struct net_device *dev, struct iw_request_info *info, struct sockaddr *ap_addr, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; /* call only for station! */ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) return -EINVAL; ap_addr->sa_family = ARPHRD_ETHER; if (wdev->valid_links) return -EOPNOTSUPP; if (wdev->links[0].client.current_bss) memcpy(ap_addr->sa_data, wdev->links[0].client.current_bss->pub.bssid, ETH_ALEN); else eth_zero_addr(ap_addr->sa_data); return 0; } int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ie_len = data->length; u8 *ie = extra; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; if (!ie_len) ie = NULL; guard(wiphy)(wdev->wiphy); /* no change */ if (wdev->wext.ie_len == ie_len && memcmp(wdev->wext.ie, ie, ie_len) == 0) return 0; if (ie_len) { ie = kmemdup(extra, ie_len, GFP_KERNEL); if (!ie) return -ENOMEM; } else { ie = NULL; } kfree(wdev->wext.ie); wdev->wext.ie = ie; wdev->wext.ie_len = ie_len; if (wdev->conn) return cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, false); /* userspace better not think we'll reconnect */ return 0; } int cfg80211_wext_siwmlme(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_mlme *mlme = (struct iw_mlme *)extra; struct cfg80211_registered_device *rdev; if (!wdev) return -EOPNOTSUPP; rdev = wiphy_to_rdev(wdev->wiphy); if (wdev->iftype != NL80211_IFTYPE_STATION) return -EINVAL; if (mlme->addr.sa_family != ARPHRD_ETHER) return -EINVAL; guard(wiphy)(&rdev->wiphy); switch (mlme->cmd) { case IW_MLME_DEAUTH: case IW_MLME_DISASSOC: return cfg80211_disconnect(rdev, dev, mlme->reason_code, true); default: return -EOPNOTSUPP; } } |
| 6 6 2 6 1 7 7 7 6 6 2 6 1 2 1 4 4 1 3 4 4 4 4 4 4 3 3 3 3 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_skbprio.c SKB Priority Queue. * * Authors: Nishanth Devarajan, <ndev2021@gmail.com> * Cody Doucette, <doucette@bu.edu> * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu */ #include <linux/string.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> #include <net/sch_generic.h> #include <net/inet_ecn.h> /* SKB Priority Queue * ================================= * * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes * packets according to their skb->priority field. Under congestion, * Skbprio drops already-enqueued lower priority packets to make space * available for higher priority packets; it was conceived as a solution * for denial-of-service defenses that need to route packets with different * priorities as a mean to overcome DoS attacks. */ struct skbprio_sched_data { /* Queue state. */ struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY]; struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY]; u16 highest_prio; u16 lowest_prio; }; static u16 calc_new_high_prio(const struct skbprio_sched_data *q) { int prio; for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) { if (!skb_queue_empty(&q->qdiscs[prio])) return prio; } /* SKB queue is empty, return 0 (default highest priority setting). */ return 0; } static u16 calc_new_low_prio(const struct skbprio_sched_data *q) { int prio; for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) { if (!skb_queue_empty(&q->qdiscs[prio])) return prio; } /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1 * (default lowest priority setting). */ return SKBPRIO_MAX_PRIORITY - 1; } static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1; struct skbprio_sched_data *q = qdisc_priv(sch); struct sk_buff_head *qdisc; struct sk_buff_head *lp_qdisc; struct sk_buff *to_drop; u16 prio, lp; /* Obtain the priority of @skb. */ prio = min(skb->priority, max_priority); qdisc = &q->qdiscs[prio]; /* sch->limit can change under us from skbprio_change() */ if (sch->q.qlen < READ_ONCE(sch->limit)) { __skb_queue_tail(qdisc, skb); qdisc_qstats_backlog_inc(sch, skb); q->qstats[prio].backlog += qdisc_pkt_len(skb); /* Check to update highest and lowest priorities. */ if (prio > q->highest_prio) q->highest_prio = prio; if (prio < q->lowest_prio) q->lowest_prio = prio; sch->q.qlen++; return NET_XMIT_SUCCESS; } /* If this packet has the lowest priority, drop it. */ lp = q->lowest_prio; if (prio <= lp) { q->qstats[prio].drops++; q->qstats[prio].overlimits++; return qdisc_drop(skb, sch, to_free); } __skb_queue_tail(qdisc, skb); qdisc_qstats_backlog_inc(sch, skb); q->qstats[prio].backlog += qdisc_pkt_len(skb); /* Drop the packet at the tail of the lowest priority qdisc. */ lp_qdisc = &q->qdiscs[lp]; to_drop = __skb_dequeue_tail(lp_qdisc); BUG_ON(!to_drop); qdisc_qstats_backlog_dec(sch, to_drop); qdisc_drop(to_drop, sch, to_free); q->qstats[lp].backlog -= qdisc_pkt_len(to_drop); q->qstats[lp].drops++; q->qstats[lp].overlimits++; /* Check to update highest and lowest priorities. */ if (skb_queue_empty(lp_qdisc)) { if (q->lowest_prio == q->highest_prio) { q->lowest_prio = prio; q->highest_prio = prio; } else { q->lowest_prio = calc_new_low_prio(q); } } if (prio > q->highest_prio) q->highest_prio = prio; return NET_XMIT_CN; } static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) { struct skbprio_sched_data *q = qdisc_priv(sch); struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio]; struct sk_buff *skb = __skb_dequeue(hpq); if (unlikely(!skb)) return NULL; sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_bstats_update(sch, skb); q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb); /* Update highest priority field. */ if (skb_queue_empty(hpq)) { if (q->lowest_prio == q->highest_prio) { q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } else { q->highest_prio = calc_new_high_prio(q); } } return skb; } static int skbprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct tc_skbprio_qopt *ctl = nla_data(opt); if (opt->nla_len != nla_attr_size(sizeof(*ctl))) return -EINVAL; WRITE_ONCE(sch->limit, ctl->limit); return 0; } static int skbprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct skbprio_sched_data *q = qdisc_priv(sch); int prio; /* Initialise all queues, one for each possible priority. */ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) __skb_queue_head_init(&q->qdiscs[prio]); memset(&q->qstats, 0, sizeof(q->qstats)); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; sch->limit = 64; if (!opt) return 0; return skbprio_change(sch, opt, extack); } static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct tc_skbprio_qopt opt; opt.limit = READ_ONCE(sch->limit); if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) return -1; return skb->len; } static void skbprio_reset(struct Qdisc *sch) { struct skbprio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) __skb_queue_purge(&q->qdiscs[prio]); memset(&q->qstats, 0, sizeof(q->qstats)); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } static void skbprio_destroy(struct Qdisc *sch) { struct skbprio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++) __skb_queue_purge(&q->qdiscs[prio]); } static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long skbprio_find(struct Qdisc *sch, u32 classid) { return 0; } static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct skbprio_sched_data *q = qdisc_priv(sch); if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1], q->qstats[cl - 1].qlen) < 0) return -1; return 0; } static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { unsigned int i; if (arg->stop) return; for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) { if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; } } static const struct Qdisc_class_ops skbprio_class_ops = { .leaf = skbprio_leaf, .find = skbprio_find, .dump = skbprio_dump_class, .dump_stats = skbprio_dump_class_stats, .walk = skbprio_walk, }; static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = { .cl_ops = &skbprio_class_ops, .id = "skbprio", .priv_size = sizeof(struct skbprio_sched_data), .enqueue = skbprio_enqueue, .dequeue = skbprio_dequeue, .peek = qdisc_peek_dequeued, .init = skbprio_init, .reset = skbprio_reset, .change = skbprio_change, .dump = skbprio_dump, .destroy = skbprio_destroy, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("skbprio"); static int __init skbprio_module_init(void) { return register_qdisc(&skbprio_qdisc_ops); } static void __exit skbprio_module_exit(void) { unregister_qdisc(&skbprio_qdisc_ops); } module_init(skbprio_module_init) module_exit(skbprio_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("SKB priority based scheduling qdisc"); |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 | /* * Compressed rom filesystem for Linux. * * Copyright (C) 1999 Linus Torvalds. * * This file is released under the GPL. */ /* * These are the VFS interfaces to the compressed rom filesystem. * The actual compression is based on zlib, see the other files. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> #include <linux/ramfs.h> #include <linux/init.h> #include <linux/string.h> #include <linux/blkdev.h> #include <linux/mtd/mtd.h> #include <linux/mtd/super.h> #include <linux/fs_context.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/mutex.h> #include <uapi/linux/cramfs_fs.h> #include <linux/uaccess.h> #include "internal.h" /* * cramfs super-block data in memory */ struct cramfs_sb_info { unsigned long magic; unsigned long size; unsigned long blocks; unsigned long files; unsigned long flags; void *linear_virt_addr; resource_size_t linear_phys_addr; size_t mtd_point_size; }; static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) { return sb->s_fs_info; } static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; static const struct file_operations cramfs_physmem_fops; static const struct address_space_operations cramfs_aops; static DEFINE_MUTEX(read_mutex); /* These macros may change in future, to provide better st_ino semantics. */ #define OFFSET(x) ((x)->i_ino) static unsigned long cramino(const struct cramfs_inode *cino, unsigned int offset) { if (!cino->offset) return offset + 1; if (!cino->size) return offset + 1; /* * The file mode test fixes buggy mkcramfs implementations where * cramfs_inode->offset is set to a non zero value for entries * which did not contain data, like devices node and fifos. */ switch (cino->mode & S_IFMT) { case S_IFREG: case S_IFDIR: case S_IFLNK: return cino->offset << 2; default: break; } return offset + 1; } static struct inode *get_cramfs_inode(struct super_block *sb, const struct cramfs_inode *cramfs_inode, unsigned int offset) { struct inode *inode; static struct timespec64 zerotime; inode = iget_locked(sb, cramino(cramfs_inode, offset)); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode_state_read_once(inode) & I_NEW)) return inode; switch (cramfs_inode->mode & S_IFMT) { case S_IFREG: inode->i_fop = &generic_ro_fops; inode->i_data.a_ops = &cramfs_aops; if (IS_ENABLED(CONFIG_CRAMFS_MTD) && CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS && CRAMFS_SB(sb)->linear_phys_addr) inode->i_fop = &cramfs_physmem_fops; break; case S_IFDIR: inode->i_op = &cramfs_dir_inode_operations; inode->i_fop = &cramfs_directory_operations; break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: init_special_inode(inode, cramfs_inode->mode, old_decode_dev(cramfs_inode->size)); break; default: printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", inode->i_mode, inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); } inode->i_mode = cramfs_inode->mode; i_uid_write(inode, cramfs_inode->uid); i_gid_write(inode, cramfs_inode->gid); /* if the lower 2 bits are zero, the inode contains data */ if (!(inode->i_ino & 3)) { inode->i_size = cramfs_inode->size; inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; } /* Struct copy intentional */ inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, zerotime))); /* inode->i_nlink is left 1 - arguably wrong for directories, but it's the best we can do without reading the directory contents. 1 yields the right result in GNU find, even without -noleaf option. */ unlock_new_inode(inode); return inode; } /* * We have our own block cache: don't fill up the buffer cache * with the rom-image, because the way the filesystem is set * up the accesses should be fairly regular and cached in the * page cache and dentry tree anyway.. * * This also acts as a way to guarantee contiguous areas of up to * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to * worry about end-of-buffer issues even when decompressing a full * page cache. * * Note: This is all optimized away at compile time when * CONFIG_CRAMFS_BLOCKDEV=n. */ #define READ_BUFFERS (2) /* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */ #define NEXT_BUFFER(_ix) ((_ix) ^ 1) /* * BLKS_PER_BUF_SHIFT should be at least 2 to allow for "compressed" * data that takes up more space than the original and with unlucky * alignment. */ #define BLKS_PER_BUF_SHIFT (2) #define BLKS_PER_BUF (1 << BLKS_PER_BUF_SHIFT) #define BUFFER_SIZE (BLKS_PER_BUF*PAGE_SIZE) static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; static unsigned buffer_blocknr[READ_BUFFERS]; static struct super_block *buffer_dev[READ_BUFFERS]; static int next_buffer; /* * Populate our block cache and return a pointer to it. */ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_mapping; struct file_ra_state ra = {}; struct page *pages[BLKS_PER_BUF]; unsigned i, blocknr, buffer; unsigned long devsize; char *data; if (!len) return NULL; blocknr = offset >> PAGE_SHIFT; offset &= PAGE_SIZE - 1; /* Check if an existing buffer already has the data.. */ for (i = 0; i < READ_BUFFERS; i++) { unsigned int blk_offset; if (buffer_dev[i] != sb) continue; if (blocknr < buffer_blocknr[i]) continue; blk_offset = (blocknr - buffer_blocknr[i]) << PAGE_SHIFT; blk_offset += offset; if (blk_offset > BUFFER_SIZE || blk_offset + len > BUFFER_SIZE) continue; return read_buffers[i] + blk_offset; } devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. */ file_ra_state_init(&ra, mapping); page_cache_sync_readahead(mapping, &ra, NULL, blocknr, BLKS_PER_BUF); for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = NULL; if (blocknr + i < devsize) { page = read_mapping_page(mapping, blocknr + i, NULL); /* synchronous error? */ if (IS_ERR(page)) page = NULL; } pages[i] = page; } buffer = next_buffer; next_buffer = NEXT_BUFFER(buffer); buffer_blocknr[buffer] = blocknr; buffer_dev[buffer] = sb; data = read_buffers[buffer]; for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = pages[i]; if (page) { memcpy_from_page(data, page, 0, PAGE_SIZE); put_page(page); } else memset(data, 0, PAGE_SIZE); data += PAGE_SIZE; } return read_buffers[buffer] + offset; } /* * Return a pointer to the linearly addressed cramfs image in memory. */ static void *cramfs_direct_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); if (!len) return NULL; if (len > sbi->size || offset > sbi->size - len) return page_address(ZERO_PAGE(0)); return sbi->linear_virt_addr + offset; } /* * Returns a pointer to a buffer containing at least LEN bytes of * filesystem starting at byte offset OFFSET into the filesystem. */ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sbi->linear_virt_addr) return cramfs_direct_read(sb, offset, len); else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) return cramfs_blkdev_read(sb, offset, len); else return NULL; } /* * For a mapping to be possible, we need a range of uncompressed and * contiguous blocks. Return the offset for the first block and number of * valid blocks for which that is true, or zero otherwise. */ static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages) { struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); int i; u32 *blockptrs, first_block_addr; /* * We can dereference memory directly here as this code may be * reached only when there is a direct filesystem image mapping * available in memory. */ blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4); first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS; i = 0; do { u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT); u32 expect = (first_block_addr + block_off) | CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED; if (blockptrs[i] != expect) { pr_debug("range: block %d/%d got %#x expects %#x\n", pgoff+i, pgoff + *pages - 1, blockptrs[i], expect); if (i == 0) return 0; break; } } while (++i < *pages); *pages = i; return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT; } #ifdef CONFIG_MMU /* * Return true if the last page of a file in the filesystem image contains * some other data that doesn't belong to that file. It is assumed that the * last block is CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED * (verified by cramfs_get_block_range() and directly accessible in memory. */ static bool cramfs_last_page_is_shared(struct inode *inode) { struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); u32 partial, last_page, blockaddr, *blockptrs; char *tail_data; partial = offset_in_page(inode->i_size); if (!partial) return false; last_page = inode->i_size >> PAGE_SHIFT; blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode)); blockaddr = blockptrs[last_page] & ~CRAMFS_BLK_FLAGS; blockaddr <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; tail_data = sbi->linear_virt_addr + blockaddr + partial; return memchr_inv(tail_data, 0, PAGE_SIZE - partial) ? true : false; } static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); unsigned int pages, max_pages, offset; unsigned long address, pgoff = vma->vm_pgoff; char *bailout_reason; int ret; ret = generic_file_readonly_mmap(file, vma); if (ret) return ret; /* * Now try to pre-populate ptes for this vma with a direct * mapping avoiding memory allocation when possible. */ /* Could COW work here? */ bailout_reason = "vma is writable"; if (vma->vm_flags & VM_WRITE) goto bailout; max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; bailout_reason = "beyond file limit"; if (pgoff >= max_pages) goto bailout; pages = min(vma_pages(vma), max_pages - pgoff); offset = cramfs_get_block_range(inode, pgoff, &pages); bailout_reason = "unsuitable block layout"; if (!offset) goto bailout; address = sbi->linear_phys_addr + offset; bailout_reason = "data is not page aligned"; if (!PAGE_ALIGNED(address)) goto bailout; /* Don't map the last page if it contains some other data */ if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) { pr_debug("mmap: %pD: last page is shared\n", file); pages--; } if (!pages) { bailout_reason = "no suitable block remaining"; goto bailout; } if (pages == vma_pages(vma)) { /* * The entire vma is mappable. remap_pfn_range() will * make it distinguishable from a non-direct mapping * in /proc/<pid>/maps by substituting the file offset * with the actual physical address. */ ret = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, pages * PAGE_SIZE, vma->vm_page_prot); } else { /* * Let's create a mixed map if we can't map it all. * The normal paging machinery will take care of the * unpopulated ptes via cramfs_read_folio(). */ int i; vm_flags_set(vma, VM_MIXEDMAP); for (i = 0; i < pages && !ret; i++) { vm_fault_t vmf; unsigned long off = i * PAGE_SIZE; vmf = vmf_insert_mixed(vma, vma->vm_start + off, PHYS_PFN(address + off)); if (vmf & VM_FAULT_ERROR) ret = vm_fault_to_errno(vmf, 0); } } if (!ret) pr_debug("mapped %pD[%lu] at 0x%08lx (%u/%lu pages) " "to vma 0x%08lx, page_prot 0x%llx\n", file, pgoff, address, pages, vma_pages(vma), vma->vm_start, (unsigned long long)pgprot_val(vma->vm_page_prot)); return ret; bailout: pr_debug("%pD[%lu]: direct mmap impossible: %s\n", file, pgoff, bailout_reason); /* Didn't manage any direct map, but normal paging is still possible */ return 0; } #else /* CONFIG_MMU */ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) { return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -ENOSYS; } static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned int pages, block_pages, max_pages, offset; pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= max_pages || pages > max_pages - pgoff) return -EINVAL; block_pages = pages; offset = cramfs_get_block_range(inode, pgoff, &block_pages); if (!offset || block_pages != pages) return -ENOSYS; addr = sbi->linear_phys_addr + offset; pr_debug("get_unmapped for %pD ofs %#lx siz %lu at 0x%08lx\n", file, pgoff*PAGE_SIZE, len, addr); return addr; } static unsigned int cramfs_physmem_mmap_capabilities(struct file *file) { return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_EXEC; } #endif /* CONFIG_MMU */ static const struct file_operations cramfs_physmem_fops = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .splice_read = filemap_splice_read, .mmap = cramfs_physmem_mmap, #ifndef CONFIG_MMU .get_unmapped_area = cramfs_physmem_get_unmapped_area, .mmap_capabilities = cramfs_physmem_mmap_capabilities, #endif }; static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); generic_shutdown_super(sb); if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { if (sbi && sbi->mtd_point_size) mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); put_mtd_device(sb->s_mtd); sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); bdev_fput(sb->s_bdev_file); } kfree(sbi); } static int cramfs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; return 0; } static int cramfs_read_super(struct super_block *sb, struct fs_context *fc, struct cramfs_super *super) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned long root_offset; bool silent = fc->sb_flags & SB_SILENT; /* We don't know the real size yet */ sbi->size = PAGE_SIZE; /* Read the first block and get the superblock from it */ mutex_lock(&read_mutex); memcpy(super, cramfs_read(sb, 0, sizeof(*super)), sizeof(*super)); mutex_unlock(&read_mutex); /* Do sanity checks on the superblock */ if (super->magic != CRAMFS_MAGIC) { /* check for wrong endianness */ if (super->magic == CRAMFS_MAGIC_WEND) { if (!silent) errorfc(fc, "wrong endianness"); return -EINVAL; } /* check at 512 byte offset */ mutex_lock(&read_mutex); memcpy(super, cramfs_read(sb, 512, sizeof(*super)), sizeof(*super)); mutex_unlock(&read_mutex); if (super->magic != CRAMFS_MAGIC) { if (super->magic == CRAMFS_MAGIC_WEND && !silent) errorfc(fc, "wrong endianness"); else if (!silent) errorfc(fc, "wrong magic"); return -EINVAL; } } /* get feature flags first */ if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) { errorfc(fc, "unsupported filesystem features"); return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super->root.mode)) { errorfc(fc, "root is not a directory"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ super->root.mode |= 0555; root_offset = super->root.offset << 2; if (super->flags & CRAMFS_FLAG_FSID_VERSION_2) { sbi->size = super->size; sbi->blocks = super->fsid.blocks; sbi->files = super->fsid.files; } else { sbi->size = 1<<28; sbi->blocks = 0; sbi->files = 0; } sbi->magic = super->magic; sbi->flags = super->flags; if (root_offset == 0) infofc(fc, "empty filesystem"); else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { errorfc(fc, "bad root offset %lu", root_offset); return -EINVAL; } return 0; } static int cramfs_finalize_super(struct super_block *sb, struct cramfs_inode *cramfs_root) { struct inode *root; /* Set it all up.. */ sb->s_flags |= SB_RDONLY; sb->s_time_min = 0; sb->s_time_max = 0; sb->s_op = &cramfs_ops; root = get_cramfs_inode(sb, cramfs_root, 0); if (IS_ERR(root)) return PTR_ERR(root); sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; return 0; } static int cramfs_blkdev_fill_super(struct super_block *sb, struct fs_context *fc) { struct cramfs_sb_info *sbi; struct cramfs_super super; int i, err; sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; /* Invalidate the read buffers on mount: think disk change.. */ for (i = 0; i < READ_BUFFERS; i++) buffer_blocknr[i] = -1; err = cramfs_read_super(sb, fc, &super); if (err) return err; return cramfs_finalize_super(sb, &super.root); } static int cramfs_mtd_fill_super(struct super_block *sb, struct fs_context *fc) { struct cramfs_sb_info *sbi; struct cramfs_super super; int err; sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; /* Map only one page for now. Will remap it when fs size is known. */ err = mtd_point(sb->s_mtd, 0, PAGE_SIZE, &sbi->mtd_point_size, &sbi->linear_virt_addr, &sbi->linear_phys_addr); if (err || sbi->mtd_point_size != PAGE_SIZE) { pr_err("unable to get direct memory access to mtd:%s\n", sb->s_mtd->name); return err ? : -ENODATA; } pr_info("checking physical address %pap for linear cramfs image\n", &sbi->linear_phys_addr); err = cramfs_read_super(sb, fc, &super); if (err) return err; /* Remap the whole filesystem now */ pr_info("linear cramfs image on mtd:%s appears to be %lu KB in size\n", sb->s_mtd->name, sbi->size/1024); mtd_unpoint(sb->s_mtd, 0, PAGE_SIZE); err = mtd_point(sb->s_mtd, 0, sbi->size, &sbi->mtd_point_size, &sbi->linear_virt_addr, &sbi->linear_phys_addr); if (err || sbi->mtd_point_size != sbi->size) { pr_err("unable to get direct memory access to mtd:%s\n", sb->s_mtd->name); return err ? : -ENODATA; } return cramfs_finalize_super(sb, &super.root); } static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = 0; if (sb->s_bdev) id = huge_encode_dev(sb->s_bdev->bd_dev); else if (sb->s_dev) id = huge_encode_dev(sb->s_dev); buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_SIZE; buf->f_blocks = CRAMFS_SB(sb)->blocks; buf->f_bfree = 0; buf->f_bavail = 0; buf->f_files = CRAMFS_SB(sb)->files; buf->f_ffree = 0; buf->f_fsid = u64_to_fsid(id); buf->f_namelen = CRAMFS_MAXPATHLEN; return 0; } /* * Read a cramfs directory entry. */ static int cramfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; char *buf; unsigned int offset; /* Offset within the thing. */ if (ctx->pos >= inode->i_size) return 0; offset = ctx->pos; /* Directory entries are always 4-byte aligned */ if (offset & 3) return -EINVAL; buf = kmalloc(CRAMFS_MAXPATHLEN, GFP_KERNEL); if (!buf) return -ENOMEM; while (offset < inode->i_size) { struct cramfs_inode *de; unsigned long nextoffset; char *name; ino_t ino; umode_t mode; int namelen; mutex_lock(&read_mutex); de = cramfs_read(sb, OFFSET(inode) + offset, sizeof(*de)+CRAMFS_MAXPATHLEN); name = (char *)(de+1); /* * Namelengths on disk are shifted by two * and the name padded out to 4-byte boundaries * with zeroes. */ namelen = de->namelen << 2; memcpy(buf, name, namelen); ino = cramino(de, OFFSET(inode) + offset); mode = de->mode; mutex_unlock(&read_mutex); nextoffset = offset + sizeof(*de) + namelen; for (;;) { if (!namelen) { kfree(buf); return -EIO; } if (buf[namelen-1]) break; namelen--; } if (!dir_emit(ctx, buf, namelen, ino, mode >> 12)) break; ctx->pos = offset = nextoffset; } kfree(buf); return 0; } /* * Lookup and fill in the inode data.. */ static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned int offset = 0; struct inode *inode = NULL; int sorted; mutex_lock(&read_mutex); sorted = CRAMFS_SB(dir->i_sb)->flags & CRAMFS_FLAG_SORTED_DIRS; while (offset < dir->i_size) { struct cramfs_inode *de; char *name; int namelen, retval; int dir_off = OFFSET(dir) + offset; de = cramfs_read(dir->i_sb, dir_off, sizeof(*de)+CRAMFS_MAXPATHLEN); name = (char *)(de+1); /* Try to take advantage of sorted directories */ if (sorted && (dentry->d_name.name[0] < name[0])) break; namelen = de->namelen << 2; offset += sizeof(*de) + namelen; /* Quick check that the name is roughly the right length */ if (((dentry->d_name.len + 3) & ~3) != namelen) continue; for (;;) { if (!namelen) { inode = ERR_PTR(-EIO); goto out; } if (name[namelen-1]) break; namelen--; } if (namelen != dentry->d_name.len) continue; retval = memcmp(dentry->d_name.name, name, namelen); if (retval > 0) continue; if (!retval) { inode = get_cramfs_inode(dir->i_sb, de, dir_off); break; } /* else (retval < 0) */ if (sorted) break; } out: mutex_unlock(&read_mutex); return d_splice_alias(inode, dentry); } static int cramfs_read_folio(struct file *file, struct folio *folio) { struct inode *inode = folio->mapping->host; u32 maxblock; int bytes_filled; void *pgdata; bool success = false; maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; bytes_filled = 0; pgdata = kmap_local_folio(folio, 0); if (folio->index < maxblock) { struct super_block *sb = inode->i_sb; u32 blkptr_offset = OFFSET(inode) + folio->index * 4; u32 block_ptr, block_start, block_len; bool uncompressed, direct; mutex_lock(&read_mutex); block_ptr = *(u32 *) cramfs_read(sb, blkptr_offset, 4); uncompressed = (block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED); direct = (block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR); block_ptr &= ~CRAMFS_BLK_FLAGS; if (direct) { /* * The block pointer is an absolute start pointer, * shifted by 2 bits. The size is included in the * first 2 bytes of the data block when compressed, * or PAGE_SIZE otherwise. */ block_start = block_ptr << CRAMFS_BLK_DIRECT_PTR_SHIFT; if (uncompressed) { block_len = PAGE_SIZE; /* if last block: cap to file length */ if (folio->index == maxblock - 1) block_len = offset_in_page(inode->i_size); } else { block_len = *(u16 *) cramfs_read(sb, block_start, 2); block_start += 2; } } else { /* * The block pointer indicates one past the end of * the current block (start of next block). If this * is the first block then it starts where the block * pointer table ends, otherwise its start comes * from the previous block's pointer. */ block_start = OFFSET(inode) + maxblock * 4; if (folio->index) block_start = *(u32 *) cramfs_read(sb, blkptr_offset - 4, 4); /* Beware... previous ptr might be a direct ptr */ if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) { /* See comments on earlier code. */ u32 prev_start = block_start; block_start = prev_start & ~CRAMFS_BLK_FLAGS; block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) { block_start += PAGE_SIZE; } else { block_len = *(u16 *) cramfs_read(sb, block_start, 2); block_start += 2 + block_len; } } block_start &= ~CRAMFS_BLK_FLAGS; block_len = block_ptr - block_start; } if (block_len == 0) ; /* hole */ else if (unlikely(block_len > 2*PAGE_SIZE || (uncompressed && block_len > PAGE_SIZE))) { mutex_unlock(&read_mutex); pr_err("bad data blocksize %u\n", block_len); goto err; } else if (uncompressed) { memcpy(pgdata, cramfs_read(sb, block_start, block_len), block_len); bytes_filled = block_len; } else { bytes_filled = cramfs_uncompress_block(pgdata, PAGE_SIZE, cramfs_read(sb, block_start, block_len), block_len); } mutex_unlock(&read_mutex); if (unlikely(bytes_filled < 0)) goto err; } memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled); flush_dcache_folio(folio); success = true; err: kunmap_local(pgdata); folio_end_read(folio, success); return 0; } static const struct address_space_operations cramfs_aops = { .read_folio = cramfs_read_folio }; /* * Our operations: */ /* * A directory can only readdir */ static const struct file_operations cramfs_directory_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = cramfs_readdir, }; static const struct inode_operations cramfs_dir_inode_operations = { .lookup = cramfs_lookup, }; static const struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; static int cramfs_get_tree(struct fs_context *fc) { int ret = -ENOPROTOOPT; if (IS_ENABLED(CONFIG_CRAMFS_MTD)) { ret = get_tree_mtd(fc, cramfs_mtd_fill_super); if (!ret) return 0; } if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) ret = get_tree_bdev(fc, cramfs_blkdev_fill_super); return ret; } static const struct fs_context_operations cramfs_context_ops = { .get_tree = cramfs_get_tree, .reconfigure = cramfs_reconfigure, }; /* * Set up the filesystem mount context. */ static int cramfs_init_fs_context(struct fs_context *fc) { fc->ops = &cramfs_context_ops; return 0; } static struct file_system_type cramfs_fs_type = { .owner = THIS_MODULE, .name = "cramfs", .init_fs_context = cramfs_init_fs_context, .kill_sb = cramfs_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("cramfs"); static int __init init_cramfs_fs(void) { int rv; rv = cramfs_uncompress_init(); if (rv < 0) return rv; rv = register_filesystem(&cramfs_fs_type); if (rv < 0) cramfs_uncompress_exit(); return rv; } static void __exit exit_cramfs_fs(void) { cramfs_uncompress_exit(); unregister_filesystem(&cramfs_fs_type); } module_init(init_cramfs_fs) module_exit(exit_cramfs_fs) MODULE_DESCRIPTION("Compressed ROM file system support"); MODULE_LICENSE("GPL"); |
| 94 2 79 84 5 4 3 87 6 81 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | // SPDX-License-Identifier: GPL-2.0-only /* * VMware vSockets Driver * * Copyright (C) 2007-2012 VMware, Inc. All rights reserved. */ #include <linux/types.h> #include <linux/socket.h> #include <linux/stddef.h> #include <net/sock.h> #include <net/vsock_addr.h> void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port) { memset(addr, 0, sizeof(*addr)); addr->svm_family = AF_VSOCK; addr->svm_cid = cid; addr->svm_port = port; } EXPORT_SYMBOL_GPL(vsock_addr_init); int vsock_addr_validate(const struct sockaddr_vm *addr) { __u8 svm_valid_flags = VMADDR_FLAG_TO_HOST; if (!addr) return -EFAULT; if (addr->svm_family != AF_VSOCK) return -EAFNOSUPPORT; if (addr->svm_flags & ~svm_valid_flags) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(vsock_addr_validate); bool vsock_addr_bound(const struct sockaddr_vm *addr) { return addr->svm_port != VMADDR_PORT_ANY; } EXPORT_SYMBOL_GPL(vsock_addr_bound); void vsock_addr_unbind(struct sockaddr_vm *addr) { vsock_addr_init(addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); } EXPORT_SYMBOL_GPL(vsock_addr_unbind); bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, const struct sockaddr_vm *other) { return addr->svm_cid == other->svm_cid && addr->svm_port == other->svm_port; } EXPORT_SYMBOL_GPL(vsock_addr_equals_addr); int vsock_addr_cast(const struct sockaddr *addr, size_t len, struct sockaddr_vm **out_addr) { if (len < sizeof(**out_addr)) return -EFAULT; *out_addr = (struct sockaddr_vm *)addr; return vsock_addr_validate(*out_addr); } EXPORT_SYMBOL_GPL(vsock_addr_cast); |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Manage send buffer * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef SMC_TX_H #define SMC_TX_H #include <linux/socket.h> #include <linux/types.h> #include "smc.h" #include "smc_cdc.h" static inline int smc_tx_prepared_sends(struct smc_connection *conn) { union smc_host_cursor sent, prep; smc_curs_copy(&sent, &conn->tx_curs_sent, conn); smc_curs_copy(&prep, &conn->tx_curs_prep, conn); return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } void smc_tx_pending(struct smc_connection *conn); void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); void smc_tx_consumer_update(struct smc_connection *conn, bool force); int smcd_tx_ism_write(struct smc_connection *conn, void *data, size_t len, u32 offset, int signal); #endif /* SMC_TX_H */ |
| 37 289 2 979 973 438 375 108 108 375 108 65 271 2 319 319 319 283 50 281 281 44 44 34 20 14 16 882 882 2 2 2 2 2 25 25 25 558 24 24 23 1 1728 1721 36 31 2 1 33 33 1 28 27 1 1 55 5 97 97 55 56 1 12 42 40 1 1598 93 94 5 60 21 39 32 61 48 94 94 51 68 93 1685 1614 815 906 914 94 1686 1 1695 1690 42 1620 237 1634 8 1694 1699 1686 1694 41 1692 33 5 3 1690 1697 9 1826 981 1694 1697 1688 1690 25 24 4 22 4 4 14 18 4 4 3 7 2 8 641 263 265 92 92 36 5 55 92 277 338 372 872 918 917 922 917 874 61 918 1089 1082 207 207 23 208 33 33 126 129 128 152 152 151 152 8 109 19 103 1 152 7 145 97 48 102 87 115 50 49 50 29 21 1 22 765 765 765 765 761 751 26 2 1596 20 20 769 765 766 5 5 5 249 244 1 11 11 11 1727 1718 1717 244 226 13 13 12 13 634 1761 13 12 13 1 13 11 11 11 11 579 579 579 7 7 7 2 7 2 564 2 2 566 1727 1727 1722 23 1721 5 4 1 1 5 4 5 3 2 52 52 4 4 3 3 1 1 1 1 1 1 2 52 52 2 2 121 60 76 76 76 75 54 76 121 118 1833 1836 1641 196 983 1799 1803 1798 20 1597 37 1596 769 1790 1772 14 1787 767 771 400 126 3 173 101 301 4 397 399 756 760 301 758 758 760 1190 738 1190 356 995 66 852 1190 1193 1194 1189 1188 16 16 558 542 8 8 558 10 558 559 2 558 560 559 558 8 559 117 116 117 4 116 48 82 82 117 10 10 10 10 10 10 10 10 10 19 19 1 3 16 7 12 10 10 10 9 8 2 10 4 4 13 15 1 16 6 6 6 6 3 4 3 2 83 83 83 8 8 11 18 18 18 18 1 17 18 5 10 18 15 17 2 16 16 182 183 182 183 182 954 957 253 254 254 17 16 2 18 14 13 1 2 6 7 5 2 3 11 4 7 79 40 10 31 18 14 31 31 3 2 58 1 38 666 782 5 4 4 748 5 14 24 72 35 36 1 1 37 37 715 80 682 41 681 6 682 709 739 745 773 119 14 14 14 14 14 14 1 7 8 8 706 1 3 4 6 3 690 1 757 694 52 12 705 3 717 40 717 46 650 101 753 5 5 1 4 4 105 690 688 3 2 108 5 5 682 13 2 98 612 555 1 555 555 549 4 4 1 4 1 1 1 1 1 1 1 4 4 1 1 1 8 1 25 2 23 6 21 1 13 8 8 13 12 9 8 7 3 4 15 18 18 18 1 5 3 9 8 5 2 1 5 3 3 3 3 3 6 7 7 6 4 7 6 7 7 47 48 48 29 41 39 48 24 24 60 1 28 24 10 72 71 72 72 29 37 6 66 72 39 29 29 7 560 559 12 2 12 12 12 1 1 1 1 1 3 3 3 3 1 3 3 1 3 1 3 3 1 3 1 3 3 3 3 683 683 3 3 3 692 693 668 692 578 526 1 1 1 1 2 2 2 2 1 1 1 1 1 1122 1123 722 1 504 2 1 1 228 60 234 1124 976 978 979 175 175 52 51 52 175 175 14 175 175 37 2 2 16 11 12 1 34 2 32 138 2 2 2 1 101 31 125 7 121 11 117 3 4 7 8 100 25 2 106 2 14 117 2 2 116 3 97 22 112 7 108 10 112 6 80 37 109 4 88 24 86 27 104 3 3 11 1 11 7 7 7 7 7 7 7 7 7 6 1 7 5 20 11 4 17 9 3 8 12 1 15 11 1 7 2 7 7 2 20 11 11 6 4 5 4 5 1 32 15 15 2 2 2 6 9 107 13 5 89 25 69 6 9 1046 6 1043 1044 3 3 3 3 1044 12 12 1091 16 1090 3 1090 17 138 1008 17 1073 4 1071 10 1082 1082 7 13 13 3 3 3 5 8 12 13 1067 1057 40 1073 1029 489 1072 11 5 1086 24 6 7 21 21 21 2 2 36 34 34 26 12 13 6 10 20 3 27 9 21 6 21 21 21 1 30 39 38 8 11 1 4 23 18 3 19 1 19 19 16 6 70 70 33 33 3 4 21 2 14 16 17 32 3 31 4 5 30 30 5 33 2 30 5 34 16 2 14 14 18 27 20 20 3 17 4 16 1046 1042 1047 1048 2619 2557 333 178 1 178 178 178 178 178 178 178 178 178 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * FIB front-end. * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ /* Changes: * * YOSHIFUJI Hideaki @USAGI * reworked default router selection. * - respect outgoing interface * - select from (probably) reachable routers (i.e. * routers in REACHABLE, STALE, DELAY or PROBE states). * - always select the same router if it is (probably) * reachable. otherwise, round-robin the list. * Ville Nuorvala * Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/capability.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/types.h> #include <linux/times.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/mroute6.h> #include <linux/init.h> #include <linux/if_arp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/jhash.h> #include <linux/siphash.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/tcp.h> #include <linux/rtnetlink.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/xfrm.h> #include <net/netevent.h> #include <net/netlink.h> #include <net/rtnh.h> #include <net/lwtunnel.h> #include <net/ip_tunnels.h> #include <net/l3mdev.h> #include <net/ip.h> #include <linux/uaccess.h> #include <linux/btf_ids.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif static int ip6_rt_type_to_error(u8 fib6_type); #define CREATE_TRACE_POINTS #include <trace/events/fib6.h> EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup); #undef CREATE_TRACE_POINTS enum rt6_nud_state { RT6_NUD_FAIL_HARD = -3, RT6_NUD_FAIL_PROBE = -2, RT6_NUD_FAIL_DO_RR = -1, RT6_NUD_SUCCEED = 1 }; INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ip6_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst); static void ip6_negative_advice(struct sock *sk, struct dst_entry *dst); static void ip6_dst_destroy(struct dst_entry *); static void ip6_dst_ifdown(struct dst_entry *, struct net_device *dev); static void ip6_dst_gc(struct dst_ops *ops); static int ip6_pkt_discard(struct sk_buff *skb); static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb); static int ip6_pkt_prohibit(struct sk_buff *skb); static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb); static void ip6_link_failure(struct sk_buff *skb); static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict); static size_t rt6_nlmsg_size(struct fib6_info *f6i); static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev); #endif struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list); void rt6_uncached_list_add(struct rt6_info *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list); rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->dst.rt_uncached)) { struct uncached_list *ul = rt->dst.rt_uncached_list; spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } static void rt6_uncached_list_flush_dev(struct net_device *dev) { int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); struct rt6_info *rt, *safe; if (list_empty(&ul->head)) continue; spin_lock_bh(&ul->lock); list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { struct inet6_dev *rt_idev = rt->rt6i_idev; struct net_device *rt_dev = rt->dst.dev; bool handled = false; if (rt_idev && rt_idev->dev == dev) { rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); handled = true; } if (rt_dev == dev) { rt->dst.dev = blackhole_netdev; netdev_ref_replace(rt_dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); handled = true; } if (handled) list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } } static inline const void *choose_neigh_daddr(const struct in6_addr *p, struct sk_buff *skb, const void *daddr) { if (!ipv6_addr_any(p)) return (const void *) p; else if (skb) return &ipv6_hdr(skb)->daddr; return daddr; } struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr) { struct neighbour *n; daddr = choose_neigh_daddr(gw, skb, daddr); n = __ipv6_neigh_lookup(dev, daddr); if (n) return n; n = neigh_create(&nd_tbl, daddr, dev); return IS_ERR(n) ? NULL : n; } static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), dst_dev(dst), skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); struct net_device *dev = dst_dev(dst); daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) return; if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) return; if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) return; __ipv6_confirm_neigh(dev, daddr); } static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, .gc = ip6_dst_gc, .gc_thresh = 1024, .check = ip6_dst_check, .default_advmss = ip6_default_advmss, .mtu = ip6_mtu, .cow_metrics = dst_cow_metrics_generic, .destroy = ip6_dst_destroy, .ifdown = ip6_dst_ifdown, .negative_advice = ip6_negative_advice, .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, .redirect = rt6_do_redirect, .local_out = __ip6_local_out, .neigh_lookup = ip6_dst_neigh_lookup, .confirm_neigh = ip6_confirm_neigh, }; static struct dst_ops ip6_dst_blackhole_ops = { .family = AF_INET6, .default_advmss = ip6_default_advmss, .neigh_lookup = ip6_dst_neigh_lookup, .check = ip6_dst_check, .destroy = ip6_dst_destroy, .cow_metrics = dst_cow_metrics_generic, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; static const u32 ip6_template_metrics[RTAX_MAX] = { [RTAX_HOPLIMIT - 1] = 0, }; static const struct fib6_info fib6_null_entry_template = { .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP), .fib6_protocol = RTPROT_KERNEL, .fib6_metric = ~(u32)0, .fib6_ref = REFCOUNT_INIT(1), .fib6_type = RTN_UNREACHABLE, .fib6_metrics = (struct dst_metrics *)&dst_default_metrics, }; static const struct rt6_info ip6_null_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -ENETUNREACH, .input = ip6_pkt_discard, .output = ip6_pkt_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; #ifdef CONFIG_IPV6_MULTIPLE_TABLES static const struct rt6_info ip6_prohibit_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EACCES, .input = ip6_pkt_prohibit, .output = ip6_pkt_prohibit_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; static const struct rt6_info ip6_blk_hole_entry_template = { .dst = { .__rcuref = RCUREF_INIT(1), .__use = 1, .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, .output = dst_discard_out, }, .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), }; #endif static void rt6_info_init(struct rt6_info *rt) { memset_after(rt, 0, dst); } /* allocate dst with ip6_dst_ops */ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, flags); if (rt) { rt6_info_init(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); } return rt; } EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); struct fib6_info *from; struct inet6_dev *idev; ip_dst_metrics_put(dst); rt6_uncached_list_del(rt); idev = rt->rt6i_idev; if (idev) { rt->rt6i_idev = NULL; in6_dev_put(idev); } from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { struct rt6_info *rt = dst_rt6_info(dst); struct inet6_dev *idev = rt->rt6i_idev; struct fib6_info *from; if (idev && idev->dev != blackhole_netdev) { struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev); if (blackhole_idev) { rt->rt6i_idev = blackhole_idev; in6_dev_put(idev); } } from = unrcu_pointer(xchg(&rt->from, NULL)); fib6_info_release(from); } static bool __rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) return time_after(jiffies, READ_ONCE(rt->dst.expires)); return false; } static bool rt6_check_expired(const struct rt6_info *rt) { struct fib6_info *from; from = rcu_dereference(rt->from); if (rt->rt6i_flags & RTF_EXPIRES) { if (time_after(jiffies, READ_ONCE(rt->dst.expires))) return true; } else if (from) { return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK || fib6_check_expired(from); } return false; } static struct fib6_info * rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) { struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference(rt->fib6_node); if (!fn) goto out; iter = rcu_dereference(fn->leaf); if (!iter) goto out; while (iter) { if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference(iter->fib6_next); } out: return NULL; } void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; if (match->nh && have_oif_match && res->nh) return; if (skb) IP6CB(skb)->flags |= IP6SKB_MULTIPATH; /* We might have already computed the hash for ICMPv6 errors. In such * case it will always be non-zero. Otherwise now is the time to do it. */ if (!fl6->mp_hash && (!match->nh || nexthop_is_multipath(match->nh))) fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL); if (unlikely(match->nh)) { nexthop_path_fib6_result(res, fl6->mp_hash); return; } first = rt6_multipath_first_sibling_rcu(match); if (!first) goto out; hash = fl6->mp_hash; if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, strict) >= 0) match = first; goto out; } list_for_each_entry_rcu(sibling, &first->fib6_siblings, fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; match = sibling; break; } out: res->f6i = match; res->nh = match->fib6_nh; } /* * Route lookup. rcu_read_lock() should be held. */ static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh, const struct in6_addr *saddr, int oif, int flags) { const struct net_device *dev; if (nh->fib_nh_flags & RTNH_F_DEAD) return false; dev = nh->fib_nh_dev; if (oif) { if (dev->ifindex == oif) return true; } else { if (ipv6_chk_addr(net, saddr, dev, flags & RT6_LOOKUP_F_IFACE)) return true; } return false; } struct fib6_nh_dm_arg { struct net *net; const struct in6_addr *saddr; int oif; int flags; struct fib6_nh *nh; }; static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_dm_arg *arg = _arg; arg->nh = nh; return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif, arg->flags); } /* returns fib6_nh from nexthop or NULL */ static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { struct fib6_nh_dm_arg arg = { .net = net, .saddr = saddr, .oif = oif, .flags = flags, }; if (nexthop_is_blackhole(nh)) return NULL; if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg)) return arg.nh; return NULL; } static void rt6_device_match(struct net *net, struct fib6_result *res, const struct in6_addr *saddr, int oif, int flags) { struct fib6_info *f6i = res->f6i; struct fib6_info *spf6i; struct fib6_nh *nh; if (!oif && ipv6_addr_any(saddr)) { if (unlikely(f6i->nh)) { nh = nexthop_fib6_nh(f6i->nh); if (nexthop_is_blackhole(f6i->nh)) goto out_blackhole; } else { nh = f6i->fib6_nh; } if (!(nh->fib_nh_flags & RTNH_F_DEAD)) goto out; } for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) { bool matched = false; if (unlikely(spf6i->nh)) { nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr, oif, flags); if (nh) matched = true; } else { nh = spf6i->fib6_nh; if (__rt6_device_match(net, nh, saddr, oif, flags)) matched = true; } if (matched) { res->f6i = spf6i; goto out; } } if (oif && flags & RT6_LOOKUP_F_IFACE) { res->f6i = net->ipv6.fib6_null_entry; nh = res->f6i->fib6_nh; goto out; } if (unlikely(f6i->nh)) { nh = nexthop_fib6_nh(f6i->nh); if (nexthop_is_blackhole(f6i->nh)) goto out_blackhole; } else { nh = f6i->fib6_nh; } if (nh->fib_nh_flags & RTNH_F_DEAD) { res->f6i = net->ipv6.fib6_null_entry; nh = res->f6i->fib6_nh; } out: res->nh = nh; res->fib6_type = res->f6i->fib6_type; res->fib6_flags = res->f6i->fib6_flags; return; out_blackhole: res->fib6_flags |= RTF_REJECT; res->fib6_type = RTN_BLACKHOLE; res->nh = nh; } #ifdef CONFIG_IPV6_ROUTER_PREF struct __rt6_probe_work { struct work_struct work; struct in6_addr target; struct net_device *dev; netdevice_tracker dev_tracker; }; static void rt6_probe_deferred(struct work_struct *w) { struct in6_addr mcaddr; struct __rt6_probe_work *work = container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0); netdev_put(work->dev, &work->dev_tracker); kfree(work); } static void rt6_probe(struct fib6_nh *fib6_nh) { struct __rt6_probe_work *work = NULL; const struct in6_addr *nh_gw; unsigned long last_probe; struct neighbour *neigh; struct net_device *dev; struct inet6_dev *idev; /* * Okay, this does not seem to be appropriate * for now, however, we need to check if it * is really so; aka Router Reachability Probing. * * Router Reachability Probe MUST be rate-limited * to no more than one per minute. */ if (!fib6_nh->fib_nh_gw_family) return; nh_gw = &fib6_nh->fib_nh_gw6; dev = fib6_nh->fib_nh_dev; rcu_read_lock(); last_probe = READ_ONCE(fib6_nh->last_probe); idev = __in6_dev_get(dev); if (!idev) goto out; neigh = __ipv6_neigh_lookup_noref(dev, nh_gw); if (neigh) { if (READ_ONCE(neigh->nud_state) & NUD_VALID) goto out; write_lock_bh(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && time_after(jiffies, neigh->updated + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) __neigh_set_probe_once(neigh); } write_unlock_bh(&neigh->lock); } else if (time_after(jiffies, last_probe + READ_ONCE(idev->cnf.rtr_probe_interval))) { work = kmalloc(sizeof(*work), GFP_ATOMIC); } if (!work || cmpxchg(&fib6_nh->last_probe, last_probe, jiffies) != last_probe) { kfree(work); } else { INIT_WORK(&work->work, rt6_probe_deferred); work->target = *nh_gw; netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC); work->dev = dev; schedule_work(&work->work); } out: rcu_read_unlock(); } #else static inline void rt6_probe(struct fib6_nh *fib6_nh) { } #endif /* * Default Router Selection (RFC 2461 6.3.6) */ static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh) { enum rt6_nud_state ret = RT6_NUD_FAIL_HARD; struct neighbour *neigh; rcu_read_lock(); neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev, &fib6_nh->fib_nh_gw6); if (neigh) { u8 nud_state = READ_ONCE(neigh->nud_state); if (nud_state & NUD_VALID) ret = RT6_NUD_SUCCEED; #ifdef CONFIG_IPV6_ROUTER_PREF else if (!(nud_state & NUD_FAILED)) ret = RT6_NUD_SUCCEED; else ret = RT6_NUD_FAIL_PROBE; #endif } else { ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ? RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR; } rcu_read_unlock(); return ret; } static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif, int strict) { int m = 0; if (!oif || nh->fib_nh_dev->ifindex == oif) m = 2; if (!m && (strict & RT6_LOOKUP_F_IFACE)) return RT6_NUD_FAIL_HARD; #ifdef CONFIG_IPV6_ROUTER_PREF m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2; #endif if ((strict & RT6_LOOKUP_F_REACHABLE) && !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) { int n = rt6_check_neigh(nh); if (n < 0) return n; } return m; } static bool find_match(struct fib6_nh *nh, u32 fib6_flags, int oif, int strict, int *mpri, bool *do_rr) { bool match_do_rr = false; bool rc = false; int m; if (nh->fib_nh_flags & RTNH_F_DEAD) goto out; if (ip6_ignore_linkdown(nh->fib_nh_dev) && nh->fib_nh_flags & RTNH_F_LINKDOWN && !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE)) goto out; m = rt6_score_route(nh, fib6_flags, oif, strict); if (m == RT6_NUD_FAIL_DO_RR) { match_do_rr = true; m = 0; /* lowest valid score */ } else if (m == RT6_NUD_FAIL_HARD) { goto out; } if (strict & RT6_LOOKUP_F_REACHABLE) rt6_probe(nh); /* note that m can be RT6_NUD_FAIL_PROBE at this point */ if (m > *mpri) { *do_rr = match_do_rr; *mpri = m; rc = true; } out: return rc; } struct fib6_nh_frl_arg { u32 flags; int oif; int strict; int *mpri; bool *do_rr; struct fib6_nh *nh; }; static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_frl_arg *arg = _arg; arg->nh = nh; return find_match(nh, arg->flags, arg->oif, arg->strict, arg->mpri, arg->do_rr); } static void __find_rr_leaf(struct fib6_info *f6i_start, struct fib6_info *nomatch, u32 metric, struct fib6_result *res, struct fib6_info **cont, int oif, int strict, bool *do_rr, int *mpri) { struct fib6_info *f6i; for (f6i = f6i_start; f6i && f6i != nomatch; f6i = rcu_dereference(f6i->fib6_next)) { bool matched = false; struct fib6_nh *nh; if (cont && f6i->fib6_metric != metric) { *cont = f6i; return; } if (fib6_check_expired(f6i)) continue; if (unlikely(f6i->nh)) { struct fib6_nh_frl_arg arg = { .flags = f6i->fib6_flags, .oif = oif, .strict = strict, .mpri = mpri, .do_rr = do_rr }; if (nexthop_is_blackhole(f6i->nh)) { res->fib6_flags = RTF_REJECT; res->fib6_type = RTN_BLACKHOLE; res->f6i = f6i; res->nh = nexthop_fib6_nh(f6i->nh); return; } if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match, &arg)) { matched = true; nh = arg.nh; } } else { nh = f6i->fib6_nh; if (find_match(nh, f6i->fib6_flags, oif, strict, mpri, do_rr)) matched = true; } if (matched) { res->f6i = f6i; res->nh = nh; res->fib6_flags = f6i->fib6_flags; res->fib6_type = f6i->fib6_type; } } } static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf, struct fib6_info *rr_head, int oif, int strict, bool *do_rr, struct fib6_result *res) { u32 metric = rr_head->fib6_metric; struct fib6_info *cont = NULL; int mpri = -1; __find_rr_leaf(rr_head, NULL, metric, res, &cont, oif, strict, do_rr, &mpri); __find_rr_leaf(leaf, rr_head, metric, res, &cont, oif, strict, do_rr, &mpri); if (res->f6i || !cont) return; __find_rr_leaf(cont, NULL, metric, res, NULL, oif, strict, do_rr, &mpri); } static void rt6_select(struct net *net, struct fib6_node *fn, int oif, struct fib6_result *res, int strict) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct fib6_info *rt0; bool do_rr = false; int key_plen; /* make sure this function or its helpers sets f6i */ res->f6i = NULL; if (!leaf || leaf == net->ipv6.fib6_null_entry) goto out; rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) rt0 = leaf; /* Double check to make sure fn is not an intermediate node * and fn->leaf does not points to its child's leaf * (This might happen if all routes under fn are deleted from * the tree and fib6_repair_tree() is called on the node.) */ key_plen = rt0->fib6_dst.plen; #ifdef CONFIG_IPV6_SUBTREES if (rt0->fib6_src.plen) key_plen = rt0->fib6_src.plen; #endif if (fn->fn_bit != key_plen) goto out; find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res); if (do_rr) { struct fib6_info *next = rcu_dereference(rt0->fib6_next); /* no entries matched; do round-robin */ if (!next || next->fib6_metric != rt0->fib6_metric) next = leaf; if (next != rt0) { spin_lock_bh(&leaf->fib6_table->tb6_lock); /* make sure next is not being deleted from the tree */ if (next->fib6_node) rcu_assign_pointer(fn->rr_ptr, next); spin_unlock_bh(&leaf->fib6_table->tb6_lock); } } out: if (!res->f6i) { res->f6i = net->ipv6.fib6_null_entry; res->nh = res->f6i->fib6_nh; res->fib6_flags = res->f6i->fib6_flags; res->fib6_type = res->f6i->fib6_type; } } static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res) { return (res->f6i->fib6_flags & RTF_NONEXTHOP) || res->nh->fib_nh_gw_family; } #ifdef CONFIG_IPV6_ROUTE_INFO int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr) { struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; struct fib6_table *table; unsigned int pref; unsigned long lifetime; struct fib6_info *rt; if (len < sizeof(struct route_info)) { return -EINVAL; } /* Sanity check for prefix_len and length */ if (rinfo->length > 3) { return -EINVAL; } else if (rinfo->prefix_len > 128) { return -EINVAL; } else if (rinfo->prefix_len > 64) { if (rinfo->length < 2) { return -EINVAL; } } else if (rinfo->prefix_len > 0) { if (rinfo->length < 1) { return -EINVAL; } } pref = rinfo->route_pref; if (pref == ICMPV6_ROUTER_PREF_INVALID) return -EINVAL; lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); if (rinfo->length == 3) prefix = (struct in6_addr *)rinfo->prefix; else { /* this function is safe */ ipv6_addr_prefix(&prefix_buf, (struct in6_addr *)rinfo->prefix, rinfo->prefix_len); prefix = &prefix_buf; } if (rinfo->prefix_len == 0) rt = rt6_get_dflt_router(net, gwaddr, dev); else rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev); if (rt && !lifetime) { ip6_del_rt(net, rt, false); rt = NULL; } if (!rt && lifetime) rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev, pref); else if (rt) rt->fib6_flags = RTF_ROUTEINFO | (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); fib6_remove_gc_list(rt); } else { fib6_set_expires(rt, jiffies + HZ * lifetime); fib6_add_gc_list(rt); } spin_unlock_bh(&table->tb6_lock); fib6_info_release(rt); } return 0; } #endif /* * Misc support functions */ /* called with rcu_lock held */ static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res) { struct net_device *dev = res->nh->fib_nh_dev; if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) { /* for copies of local routes, dst->dev needs to be the * device if it is a master device, the master device if * device is enslaved, and the loopback as the default */ if (netif_is_l3_slave(dev) && !rt6_need_strict(&res->f6i->fib6_dst.addr)) dev = l3mdev_master_dev_rcu(dev); else if (!netif_is_l3_master(dev)) dev = dev_net(dev)->loopback_dev; /* last case is netif_is_l3_master(dev) is true in which * case we want dev returned to be dev */ } return dev; } static const int fib6_prop[RTN_MAX + 1] = { [RTN_UNSPEC] = 0, [RTN_UNICAST] = 0, [RTN_LOCAL] = 0, [RTN_BROADCAST] = 0, [RTN_ANYCAST] = 0, [RTN_MULTICAST] = 0, [RTN_BLACKHOLE] = -EINVAL, [RTN_UNREACHABLE] = -EHOSTUNREACH, [RTN_PROHIBIT] = -EACCES, [RTN_THROW] = -EAGAIN, [RTN_NAT] = -EINVAL, [RTN_XRESOLVE] = -EINVAL, }; static int ip6_rt_type_to_error(u8 fib6_type) { return fib6_prop[fib6_type]; } static unsigned short fib6_info_dst_flags(struct fib6_info *rt) { unsigned short flags = 0; if (rt->dst_nocount) flags |= DST_NOCOUNT; if (rt->dst_nopolicy) flags |= DST_NOPOLICY; return flags; } static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type) { rt->dst.error = ip6_rt_type_to_error(fib6_type); switch (fib6_type) { case RTN_BLACKHOLE: rt->dst.output = dst_discard_out; rt->dst.input = dst_discard; break; case RTN_PROHIBIT: rt->dst.output = ip6_pkt_prohibit_out; rt->dst.input = ip6_pkt_prohibit; break; case RTN_THROW: case RTN_UNREACHABLE: default: rt->dst.output = ip6_pkt_discard_out; rt->dst.input = ip6_pkt_discard; break; } } static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res) { struct fib6_info *f6i = res->f6i; if (res->fib6_flags & RTF_REJECT) { ip6_rt_init_dst_reject(rt, res->fib6_type); return; } rt->dst.error = 0; rt->dst.output = ip6_output; if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) { rt->dst.input = ip6_input; } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) { rt->dst.input = ip6_mc_input; rt->dst.output = ip6_mr_output; } else { rt->dst.input = ip6_forward; } if (res->nh->fib_nh_lws) { rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws); lwtunnel_set_redirect(&rt->dst); } rt->dst.lastuse = jiffies; } /* Caller must already hold reference to @from */ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from) { rt->rt6i_flags &= ~RTF_EXPIRES; rcu_assign_pointer(rt->from, from); ip_dst_init_metrics(&rt->dst, from->fib6_metrics); } /* Caller must already hold reference to f6i in result */ static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res) { const struct fib6_nh *nh = res->nh; const struct net_device *dev = nh->fib_nh_dev; struct fib6_info *f6i = res->f6i; ip6_rt_init_dst(rt, res); rt->rt6i_dst = f6i->fib6_dst; rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL; rt->rt6i_flags = res->fib6_flags; if (nh->fib_nh_gw_family) { rt->rt6i_gateway = nh->fib_nh_gw6; rt->rt6i_flags |= RTF_GATEWAY; } rt6_set_from(rt, f6i); #ifdef CONFIG_IPV6_SUBTREES rt->rt6i_src = f6i->fib6_src; #endif } static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; pn = rcu_dereference(fn->parent); sn = FIB6_SUBTREE(pn); if (sn && sn != fn) fn = fib6_node_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) return fn; } } static bool ip6_hold_safe(struct net *net, struct rt6_info **prt) { struct rt6_info *rt = *prt; if (dst_hold_safe(&rt->dst)) return true; if (net) { rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); } else { rt = NULL; } *prt = rt; return false; } /* called with rcu_lock held */ static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res) { struct net_device *dev = res->nh->fib_nh_dev; struct fib6_info *f6i = res->f6i; unsigned short flags; struct rt6_info *nrt; if (!fib6_info_hold_safe(f6i)) goto fallback; flags = fib6_info_dst_flags(f6i); nrt = ip6_dst_alloc(dev_net(dev), dev, flags); if (!nrt) { fib6_info_release(f6i); goto fallback; } ip6_rt_copy_init(nrt, res); return nrt; fallback: nrt = dev_net(dev)->ipv6.ip6_null_entry; dst_hold(&nrt->dst); return nrt; } INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; struct fib6_node *fn; struct rt6_info *rt; rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: res.f6i = rcu_dereference(fn->leaf); if (!res.f6i) res.f6i = net->ipv6.fib6_null_entry; else rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif, flags); if (res.f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; rt = net->ipv6.ip6_null_entry; dst_hold(&rt->dst); goto out; } else if (res.fib6_flags & RTF_REJECT) { goto do_create; } fib6_select_path(net, &res, fl6, fl6->flowi6_oif, fl6->flowi6_oif != 0, skb, flags); /* Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { if (ip6_hold_safe(net, &rt)) dst_use_noref(&rt->dst, jiffies); } else { do_create: rt = ip6_create_rt_rcu(&res); } out: trace_fib6_table_lookup(net, &res, table, fl6); rcu_read_unlock(); return rt; } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup); } EXPORT_SYMBOL_GPL(ip6_route_lookup); struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int strict) { struct flowi6 fl6 = { .flowi6_oif = oif, .daddr = *daddr, }; struct dst_entry *dst; int flags = strict ? RT6_LOOKUP_F_IFACE : 0; if (saddr) { memcpy(&fl6.saddr, saddr, sizeof(*saddr)); flags |= RT6_LOOKUP_F_HAS_SADDR; } dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup); if (dst->error == 0) return dst_rt6_info(dst); dst_release(dst); return NULL; } EXPORT_SYMBOL(rt6_lookup); /* ip6_ins_rt is called with FREE table->tb6_lock. * It takes new route entry, the addition fails by any reason the * route is released. * Caller must hold dst before calling it. */ static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { int err; struct fib6_table *table; table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, extack); spin_unlock_bh(&table->tb6_lock); return err; } int ip6_ins_rt(struct net *net, struct fib6_info *rt) { struct nl_info info = { .nl_net = net, }; return __ip6_ins_rt(rt, &info, NULL); } static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_info *f6i = res->f6i; struct net_device *dev; struct rt6_info *rt; /* * Clone the route. */ if (!fib6_info_hold_safe(f6i)) return NULL; dev = ip6_rt_get_dev_rcu(res); rt = ip6_dst_alloc(dev_net(dev), dev, 0); if (!rt) { fib6_info_release(f6i); return NULL; } ip6_rt_copy_init(rt, res); rt->rt6i_flags |= RTF_CACHE; rt->rt6i_dst.addr = *daddr; rt->rt6i_dst.plen = 128; if (!rt6_is_gw_or_nonexthop(res)) { if (f6i->fib6_dst.plen != 128 && ipv6_addr_equal(&f6i->fib6_dst.addr, daddr)) rt->rt6i_flags |= RTF_ANYCAST; #ifdef CONFIG_IPV6_SUBTREES if (rt->rt6i_src.plen && saddr) { rt->rt6i_src.addr = *saddr; rt->rt6i_src.plen = 128; } #endif } return rt; } static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) { struct fib6_info *f6i = res->f6i; unsigned short flags = fib6_info_dst_flags(f6i); struct net_device *dev; struct rt6_info *pcpu_rt; if (!fib6_info_hold_safe(f6i)) return NULL; rcu_read_lock(); dev = ip6_rt_get_dev_rcu(res); pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); rcu_read_unlock(); if (!pcpu_rt) { fib6_info_release(f6i); return NULL; } ip6_rt_copy_init(pcpu_rt, res); pcpu_rt->rt6i_flags |= RTF_PCPU; if (f6i->nh) pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev)); return pcpu_rt; } static bool rt6_is_valid(const struct rt6_info *rt6) { return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev)); } /* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res) { struct rt6_info *pcpu_rt; pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu); if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) { struct rt6_info *prev, **p; p = this_cpu_ptr(res->nh->rt6i_pcpu); /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */ prev = xchg(p, NULL); if (prev) { dst_dev_put(&prev->dst); dst_release(&prev->dst); } pcpu_rt = NULL; } return pcpu_rt; } static struct rt6_info *rt6_make_pcpu_route(struct net *net, const struct fib6_result *res) { struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(res); if (!pcpu_rt) return NULL; p = this_cpu_ptr(res->nh->rt6i_pcpu); prev = cmpxchg(p, NULL, pcpu_rt); BUG_ON(prev); if (res->f6i->fib6_destroying) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } return pcpu_rt; } /* exception hash table implementation */ static DEFINE_SPINLOCK(rt6_exception_lock); /* Remove rt6_ex from hash table and free the memory * Caller must hold rt6_exception_lock */ static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { struct net *net; if (!bucket || !rt6_ex) return; net = dev_net(rt6_ex->rt6i->dst.dev); net->ipv6.rt6_stats->fib_rt_cache--; /* purge completely the exception to allow releasing the held resources: * some [sk] cache may keep the dst around for unlimited time */ dst_dev_put(&rt6_ex->rt6i->dst); hlist_del_rcu(&rt6_ex->hlist); dst_release(&rt6_ex->rt6i->dst); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; } /* Remove oldest rt6_ex in bucket and free the memory * Caller must hold rt6_exception_lock */ static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) { struct rt6_exception *rt6_ex, *oldest = NULL; if (!bucket) return; hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) oldest = rt6_ex; } rt6_remove_exception(bucket, oldest); } static u32 rt6_exception_hash(const struct in6_addr *dst, const struct in6_addr *src) { static siphash_aligned_key_t rt6_exception_key; struct { struct in6_addr dst; struct in6_addr src; } __aligned(SIPHASH_ALIGNMENT) combined = { .dst = *dst, }; u64 val; net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key)); #ifdef CONFIG_IPV6_SUBTREES if (src) combined.src = *src; #endif val = siphash(&combined, sizeof(combined), &rt6_exception_key); return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); } /* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rt6_exception_lock */ static struct rt6_exception * __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct rt6_exception *rt6_ex; u32 hval; if (!(*bucket) || !daddr) return NULL; hval = rt6_exception_hash(daddr, saddr); *bucket += hval; hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { struct rt6_info *rt6 = rt6_ex->rt6i; bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); #ifdef CONFIG_IPV6_SUBTREES if (matched && saddr) matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); #endif if (matched) return rt6_ex; } return NULL; } /* Helper function to find the cached rt in the hash table * and update bucket pointer to point to the bucket for this * (daddr, saddr) pair * Caller must hold rcu_read_lock() */ static struct rt6_exception * __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct rt6_exception *rt6_ex; u32 hval; WARN_ON_ONCE(!rcu_read_lock_held()); if (!(*bucket) || !daddr) return NULL; hval = rt6_exception_hash(daddr, saddr); *bucket += hval; hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { struct rt6_info *rt6 = rt6_ex->rt6i; bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); #ifdef CONFIG_IPV6_SUBTREES if (matched && saddr) matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); #endif if (matched) return rt6_ex; } return NULL; } static unsigned int fib6_mtu(const struct fib6_result *res) { const struct fib6_nh *nh = res->nh; unsigned int mtu; if (res->f6i->fib6_pmtu) { mtu = res->f6i->fib6_pmtu; } else { struct net_device *dev = nh->fib_nh_dev; struct inet6_dev *idev; rcu_read_lock(); idev = __in6_dev_get(dev); mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } #define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL /* used when the flushed bit is not relevant, only access to the bucket * (ie., all bucket users except rt6_insert_exception); * * called under rcu lock; sometimes called with rt6_exception_lock held */ static struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh, spinlock_t *lock) { struct rt6_exception_bucket *bucket; if (lock) bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(lock)); else bucket = rcu_dereference(nh->rt6i_exception_bucket); /* remove bucket flushed bit if set */ if (bucket) { unsigned long p = (unsigned long)bucket; p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED; bucket = (struct rt6_exception_bucket *)p; } return bucket; } static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket) { unsigned long p = (unsigned long)bucket; return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED); } /* called with rt6_exception_lock held */ static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh, spinlock_t *lock) { struct rt6_exception_bucket *bucket; unsigned long p; bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(lock)); p = (unsigned long)bucket; p |= FIB6_EXCEPTION_BUCKET_FLUSHED; bucket = (struct rt6_exception_bucket *)p; rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); } static int rt6_insert_exception(struct rt6_info *nrt, const struct fib6_result *res) { struct net *net = dev_net(nrt->dst.dev); struct rt6_exception_bucket *bucket; struct fib6_info *f6i = res->f6i; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; struct fib6_nh *nh = res->nh; int max_depth; int err = 0; spin_lock_bh(&rt6_exception_lock); bucket = rcu_dereference_protected(nh->rt6i_exception_bucket, lockdep_is_held(&rt6_exception_lock)); if (!bucket) { bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), GFP_ATOMIC); if (!bucket) { err = -ENOMEM; goto out; } rcu_assign_pointer(nh->rt6i_exception_bucket, bucket); } else if (fib6_nh_excptn_bucket_flushed(bucket)) { err = -EINVAL; goto out; } #ifdef CONFIG_IPV6_SUBTREES /* fib6_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * Otherwise, the exception table is indexed by * a hash of only fib6_dst. */ if (f6i->fib6_src.plen) src_key = &nrt->rt6i_src.addr; #endif /* rt6_mtu_change() might lower mtu on f6i. * Only insert this exception route if its mtu * is less than f6i's mtu value. */ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) { err = -EINVAL; goto out; } rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_remove_exception(bucket, rt6_ex); rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); if (!rt6_ex) { err = -ENOMEM; goto out; } rt6_ex->rt6i = nrt; rt6_ex->stamp = jiffies; hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); bucket->depth++; net->ipv6.rt6_stats->fib_rt_cache++; /* Randomize max depth to avoid some side channels attacks. */ max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH); while (bucket->depth > max_depth) rt6_exception_remove_oldest(bucket); out: spin_unlock_bh(&rt6_exception_lock); /* Update fn->fn_sernum to invalidate all cached dst */ if (!err) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum(net, f6i); fib6_add_gc_list(f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } return err; } static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) goto out; /* Prevent rt6_insert_exception() to recreate the bucket list */ if (!from) fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock); for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { if (!from || rcu_access_pointer(rt6_ex->rt6i->from) == from) rt6_remove_exception(bucket, rt6_ex); } WARN_ON_ONCE(!from && bucket->depth); bucket++; } out: spin_unlock_bh(&rt6_exception_lock); } static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) { struct fib6_info *f6i = arg; fib6_nh_flush_exceptions(nh, f6i); return 0; } void rt6_flush_exceptions(struct fib6_info *f6i) { if (f6i->nh) { rcu_read_lock(); nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i); rcu_read_unlock(); } else { fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); } } /* Find cached rt in the hash table inside passed in rt * Caller has to hold rcu_read_lock() */ static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct rt6_info *ret = NULL; #ifdef CONFIG_IPV6_SUBTREES /* fib6i_src.plen != 0 indicates f6i is in subtree * and exception table is indexed by a hash of * both fib6_dst and fib6_src. * However, the src addr used to create the hash * might not be exactly the passed in saddr which * is a /128 addr from the flow. * So we need to use f6i->fib6_src to redo lookup * if the passed in saddr does not find anything. * (See the logic in ip6_rt_cache_alloc() on how * rt->rt6i_src is updated.) */ if (res->f6i->fib6_src.plen) src_key = saddr; find_ex: #endif bucket = fib6_nh_get_excptn_bucket(res->nh, NULL); rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) ret = rt6_ex->rt6i; #ifdef CONFIG_IPV6_SUBTREES /* Use fib6_src as src_key and redo lookup */ if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) { src_key = &res->f6i->fib6_src.addr; goto find_ex; } #endif return ret; } /* Remove the passed in cached rt from the hash table that contains it */ static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int err; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return -ENOENT; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_spinlock(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) { rt6_remove_exception(bucket, rt6_ex); err = 0; } else { err = -ENOENT; } spin_unlock_bh(&rt6_exception_lock); return err; } struct fib6_nh_excptn_arg { struct rt6_info *rt; int plen; }; static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg) { struct fib6_nh_excptn_arg *arg = _arg; int err; err = fib6_nh_remove_exception(nh, arg->plen, arg->rt); if (err == 0) return 1; return 0; } static int rt6_remove_exception_rt(struct rt6_info *rt) { struct fib6_info *from; from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) return -EINVAL; if (from->nh) { struct fib6_nh_excptn_arg arg = { .rt = rt, .plen = from->fib6_src.plen }; int rc; /* rc = 1 means an entry was found */ rc = nexthop_for_each_fib6_nh(from->nh, rt6_nh_remove_exception_rt, &arg); return rc ? 0 : -ENOENT; } return fib6_nh_remove_exception(from->fib6_nh, from->fib6_src.plen, rt); } /* Find rt6_ex which contains the passed in rt cache and * refresh its stamp */ static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen, const struct rt6_info *rt) { const struct in6_addr *src_key = NULL; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; bucket = fib6_nh_get_excptn_bucket(nh, NULL); #ifdef CONFIG_IPV6_SUBTREES /* rt6i_src.plen != 0 indicates 'from' is in subtree * and exception table is indexed by a hash of * both rt6i_dst and rt6i_src. * Otherwise, the exception table is indexed by * a hash of only rt6i_dst. */ if (plen) src_key = &rt->rt6i_src.addr; #endif rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key); if (rt6_ex) rt6_ex->stamp = jiffies; } struct fib6_nh_match_arg { const struct net_device *dev; const struct in6_addr *gw; struct fib6_nh *match; }; /* determine if fib6_nh has given device and gateway */ static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_match_arg *arg = _arg; if (arg->dev != nh->fib_nh_dev || (arg->gw && !nh->fib_nh_gw_family) || (!arg->gw && nh->fib_nh_gw_family) || (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))) return 0; arg->match = nh; /* found a match, break the loop */ return 1; } static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct fib6_info *from; struct fib6_nh *fib6_nh; rcu_read_lock(); from = rcu_dereference(rt->from); if (!from || !(rt->rt6i_flags & RTF_CACHE)) goto unlock; if (from->nh) { struct fib6_nh_match_arg arg = { .dev = rt->dst.dev, .gw = &rt->rt6i_gateway, }; nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg); if (!arg.match) goto unlock; fib6_nh = arg.match; } else { fib6_nh = from->fib6_nh; } fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt); unlock: rcu_read_unlock(); } static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, struct rt6_info *rt, int mtu) { /* If the new MTU is lower than the route PMTU, this new MTU will be the * lowest MTU in the path: always allow updating the route PMTU to * reflect PMTU decreases. * * If the new MTU is higher, and the route PMTU is equal to the local * MTU, this means the old MTU is the lowest in the path, so allow * updating it: if other nodes now have lower MTUs, PMTU discovery will * handle this. */ if (dst_mtu(&rt->dst) >= mtu) return true; if (dst_mtu(&rt->dst) == idev->cnf.mtu6) return true; return false; } static void rt6_exceptions_update_pmtu(struct inet6_dev *idev, const struct fib6_nh *nh, int mtu) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i; bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (!bucket) return; for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { struct rt6_info *entry = rt6_ex->rt6i; /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected * route), the metrics of its rt->from have already * been updated. */ if (dst_metric_raw(&entry->dst, RTAX_MTU) && rt6_mtu_change_route_allowed(idev, entry, mtu)) dst_metric_set(&entry->dst, RTAX_MTU, mtu); } bucket++; } } #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh, const struct in6_addr *gateway) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; spin_lock_bh(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { struct rt6_info *entry = rt6_ex->rt6i; if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY && ipv6_addr_equal(gateway, &entry->rt6i_gateway)) { rt6_remove_exception(bucket, rt6_ex); } } bucket++; } } spin_unlock_bh(&rt6_exception_lock); } static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex, struct fib6_gc_args *gc_args, unsigned long now) { struct rt6_info *rt = rt6_ex->rt6i; /* we are pruning and obsoleting aged-out and non gateway exceptions * even if others have still references to them, so that on next * dst_check() such references can be dropped. * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when * expired, independently from their aging, as per RFC 8201 section 4 */ if (!(rt->rt6i_flags & RTF_EXPIRES)) { if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) + gc_args->timeout)) { pr_debug("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) { pr_debug("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } if (rt->rt6i_flags & RTF_GATEWAY) { struct neighbour *neigh; neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (!(neigh && (neigh->flags & NTF_ROUTER))) { pr_debug("purging route %p via non-router but gateway\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } gc_args->more++; } static void fib6_nh_age_exceptions(const struct fib6_nh *nh, struct fib6_gc_args *gc_args, unsigned long now) { struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; struct hlist_node *tmp; int i; if (!rcu_access_pointer(nh->rt6i_exception_bucket)) return; rcu_read_lock_bh(); spin_lock(&rt6_exception_lock); bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock); if (bucket) { for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) { rt6_age_examine_exception(bucket, rt6_ex, gc_args, now); } bucket++; } } spin_unlock(&rt6_exception_lock); rcu_read_unlock_bh(); } struct fib6_nh_age_excptn_arg { struct fib6_gc_args *gc_args; unsigned long now; }; static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg) { struct fib6_nh_age_excptn_arg *arg = _arg; fib6_nh_age_exceptions(nh, arg->gc_args, arg->now); return 0; } void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now) { if (f6i->nh) { struct fib6_nh_age_excptn_arg arg = { .gc_args = gc_args, .now = now }; nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, &arg); } else { fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now); } } /* must be called with rcu lock held */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict) { struct fib6_node *fn, *saved_fn; fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; redo_rt6_select: rt6_select(net, fn, oif, res, strict); if (res->f6i == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto redo_rt6_select; else if (strict & RT6_LOOKUP_F_REACHABLE) { /* also consider unreachable route */ strict &= ~RT6_LOOKUP_F_REACHABLE; fn = saved_fn; goto redo_rt6_select; } } trace_fib6_table_lookup(net, res, table, fl6); return 0; } struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct fib6_result res = {}; struct rt6_info *rt = NULL; int strict = 0; WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) && !rcu_read_lock_held()); strict |= flags & RT6_LOOKUP_F_IFACE; strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE; if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0) strict |= RT6_LOOKUP_F_REACHABLE; rcu_read_lock(); fib6_table_lookup(net, table, oif, fl6, &res, strict); if (res.f6i == net->ipv6.fib6_null_entry) goto out; fib6_select_path(net, &res, fl6, oif, false, skb, strict); /*Search through exception table */ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr); if (rt) { goto out; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !res.nh->fib_nh_gw_family)) { /* Create a RTF_CACHE clone which will not be * owned by the fib6 tree. It is for the special case where * the daddr in the skb during the neighbor look-up is different * from the fl6->daddr used to look-up route here. */ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL); if (rt) { /* 1 refcnt is taken during ip6_rt_cache_alloc(). * As rt6_uncached_list_add() does not consume refcnt, * this refcnt is always returned to the caller even * if caller sets RT6_LOOKUP_F_DST_NOREF flag. */ rt6_uncached_list_add(rt); rcu_read_unlock(); return rt; } } else { /* Get a percpu copy */ local_bh_disable(); rt = rt6_get_pcpu_route(&res); if (!rt) rt = rt6_make_pcpu_route(net, &res); local_bh_enable(); } out: if (!rt) rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) ip6_hold_safe(net, &rt); rcu_read_unlock(); return rt; } EXPORT_SYMBOL_GPL(ip6_pol_route); INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags); } struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input); } EXPORT_SYMBOL_GPL(ip6_route_input_lookup); static void ip6_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *keys, struct flow_keys *flkeys) { const struct ipv6hdr *outer_iph = ipv6_hdr(skb); const struct ipv6hdr *key_iph = outer_iph; struct flow_keys *_flkeys = flkeys; const struct ipv6hdr *inner_iph; const struct icmp6hdr *icmph; struct ipv6hdr _inner_iph; struct icmp6hdr _icmph; if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6)) goto out; icmph = skb_header_pointer(skb, skb_transport_offset(skb), sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmpv6_is_err(icmph->icmp6_type)) goto out; inner_iph = skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(*icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; _flkeys = NULL; out: if (_flkeys) { keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src; keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst; keys->tags.flow_label = _flkeys->tags.flow_label; keys->basic.ip_proto = _flkeys->basic.ip_proto; } else { keys->addrs.v6addrs.src = key_iph->saddr; keys->addrs.v6addrs.dst = key_iph->daddr; keys->tags.flow_label = ip6_flowlabel(key_iph); keys->basic.ip_proto = key_iph->nexthdr; } } static u32 rt6_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. */ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 rt6_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 rt6_multipath_custom_hash_fl6(const struct net *net, const struct flowi6 *fl6) { u32 hash_fields = ip6_multipath_hash_fields(net); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v6addrs.src = fl6->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v6addrs.dst = fl6->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl6->fl6_sport; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl6 can be NULL */ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *flkeys) { struct flow_keys hash_keys; u32 mhash = 0; switch (ip6_multipath_hash_policy(net)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { ip6_multipath_l3_keys(skb, &hash_keys, flkeys); } else { hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (skb) { struct flow_keys keys; if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, 0); flkeys = &keys; } /* Inner can be v4 or v6 */ if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst; hash_keys.tags.flow_label = flkeys->tags.flow_label; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; ip6_multipath_l3_keys(skb, &hash_keys, flkeys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); hash_keys.basic.ip_proto = fl6->flowi6_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) mhash = rt6_multipath_custom_hash_skb(net, skb); else mhash = rt6_multipath_custom_hash_fl6(net, fl6); break; } return mhash >> 1; } /* Called with rcu held */ void ip6_route_input(struct sk_buff *skb) { const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF; struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_mark = skb->mark, .flowi6_proto = iph->nexthdr, }; struct flow_keys *flkeys = NULL, _flkeys; tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys)) flkeys = &_flkeys; if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6)) fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys); skb_dst_drop(skb); skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags)); } INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags); } static struct dst_entry *ip6_route_output_flags_noref(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { bool any_src; if (ipv6_addr_type(&fl6->daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) { struct dst_entry *dst; /* This function does not take refcnt on the dst */ dst = l3mdev_link_scope_lookup(net, fl6); if (dst) return dst; } fl6->flowi6_iif = LOOPBACK_IFINDEX; flags |= RT6_LOOKUP_F_DST_NOREF; any_src = ipv6_addr_any(&fl6->saddr); if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || (fl6->flowi6_oif && any_src)) flags |= RT6_LOOKUP_F_IFACE; if (!any_src) flags |= RT6_LOOKUP_F_HAS_SADDR; else if (sk) flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs)); return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output); } struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags) { struct dst_entry *dst; struct rt6_info *rt6; rcu_read_lock(); dst = ip6_route_output_flags_noref(net, sk, fl6, flags); rt6 = dst_rt6_info(dst); /* For dst cached in uncached_list, refcnt is already taken. */ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) { dst = &net->ipv6.ip6_null_entry->dst; dst_hold(dst); } rcu_read_unlock(); return dst; } EXPORT_SYMBOL_GPL(ip6_route_output_flags); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rt6_info *rt, *ort = dst_rt6_info(dst_orig); struct net_device *loopback_dev = net->loopback_dev; struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; dst_copy_metrics(new, &ort->dst); rt->rt6i_idev = in6_dev_get(loopback_dev); rt->rt6i_gateway = ort->rt6i_gateway; rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU; memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); #ifdef CONFIG_IPV6_SUBTREES memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); #endif } dst_release(dst_orig); return new ? new : ERR_PTR(-ENOMEM); } /* * Destination cache support functions */ static bool fib6_check(struct fib6_info *f6i, u32 cookie) { u32 rt_cookie = 0; if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie) return false; if (fib6_check_expired(f6i)) return false; return true; } static struct dst_entry *rt6_check(struct rt6_info *rt, struct fib6_info *from, u32 cookie) { u32 rt_cookie = 0; if (!from || !fib6_get_cookie_safe(from, &rt_cookie) || rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) return NULL; return &rt->dst; } static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, struct fib6_info *from, u32 cookie) { if (!__rt6_check_expired(rt) && READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && fib6_check(from, cookie)) return &rt->dst; return NULL; } INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) { struct dst_entry *dst_ret; struct fib6_info *from; struct rt6_info *rt; rt = dst_rt6_info(dst); if (rt->sernum) return rt6_is_valid(rt) ? dst : NULL; rcu_read_lock(); /* All IPV6 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. */ from = rcu_dereference(rt->from); if (from && (rt->rt6i_flags & RTF_PCPU || unlikely(!list_empty(&rt->dst.rt_uncached)))) dst_ret = rt6_dst_from_check(rt, from, cookie); else dst_ret = rt6_check(rt, from, cookie); rcu_read_unlock(); return dst_ret; } EXPORT_INDIRECT_CALLABLE(ip6_dst_check); static void ip6_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rt6_info *rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_CACHE) { rcu_read_lock(); if (rt6_check_expired(rt)) { /* rt/dst can not be destroyed yet, * because of rcu_read_lock() */ sk_dst_reset(sk); rt6_remove_exception_rt(rt); } rcu_read_unlock(); return; } sk_dst_reset(sk); } static void ip6_link_failure(struct sk_buff *skb) { struct rt6_info *rt; icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0); rt = dst_rt6_info(skb_dst(skb)); if (rt) { rcu_read_lock(); if (rt->rt6i_flags & RTF_CACHE) { rt6_remove_exception_rt(rt); } else { struct fib6_info *from; struct fib6_node *fn; from = rcu_dereference(rt->from); if (from) { fn = rcu_dereference(from->fib6_node); if (fn && (rt->rt6i_flags & RTF_DEFAULT)) WRITE_ONCE(fn->fn_sernum, -1); } } rcu_read_unlock(); } } static void rt6_update_expires(struct rt6_info *rt0, int timeout) { if (!(rt0->rt6i_flags & RTF_EXPIRES)) { struct fib6_info *from; rcu_read_lock(); from = rcu_dereference(rt0->from); if (from) WRITE_ONCE(rt0->dst.expires, from->expires); rcu_read_unlock(); } dst_set_expires(&rt0->dst, timeout); rt0->rt6i_flags |= RTF_EXPIRES; } static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) { struct net *net = dev_net(rt->dst.dev); dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); } static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, const struct ipv6hdr *iph, u32 mtu, bool confirm_neigh) { const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = dst_rt6_info(dst); /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. * [see also comment in rt6_mtu_change_route()] */ if (iph) { daddr = &iph->daddr; saddr = &iph->saddr; } else if (sk) { daddr = &sk->sk_v6_daddr; saddr = &inet6_sk(sk)->saddr; } else { daddr = NULL; saddr = NULL; } if (confirm_neigh) dst_confirm_neigh(dst, daddr); if (mtu < IPV6_MIN_MTU) return; if (mtu >= dst_mtu(dst)) return; if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); /* update rt6_ex->stamp for cache */ if (rt6->rt6i_flags & RTF_CACHE) rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct fib6_result res = {}; struct rt6_info *nrt6; rcu_read_lock(); res.f6i = rcu_dereference(rt6->from); if (!res.f6i) goto out_unlock; res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; if (res.f6i->nh) { struct fib6_nh_match_arg arg = { .dev = dst_dev_rcu(dst), .gw = &rt6->rt6i_gateway, }; nexthop_for_each_fib6_nh(res.f6i->nh, fib6_nh_find_match, &arg); /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev + gw. Should be impossible. */ if (!arg.match) goto out_unlock; res.nh = arg.match; } else { res.nh = res.f6i->fib6_nh; } nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); if (rt6_insert_exception(nrt6, &res)) dst_release_immediate(&nrt6->dst); } out_unlock: rcu_read_unlock(); } } static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, confirm_neigh); } void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_oif = oif, .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark), .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_uid = uid, }; dst = ip6_route_output(net, NULL, &fl6); if (!dst->error) __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_update_pmtu); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) { int oif = sk->sk_bound_dev_if; struct dst_entry *dst; if (!oif && skb->dev) oif = l3mdev_master_ifindex(skb->dev); ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), sk_uid(sk)); dst = __sk_dst_get(sk); if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) return; bh_lock_sock(sk); if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) ip6_datagram_dst_update(sk, false); bh_unlock_sock(sk); } EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu); void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, const struct flowi6 *fl6) { #ifdef CONFIG_IPV6_SUBTREES struct ipv6_pinfo *np = inet6_sk(sk); #endif ip6_dst_store(sk, dst, ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? true : #endif false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, struct flowi6 *fl6, const struct in6_addr *gw, struct rt6_info **ret) { const struct fib6_nh *nh = res->nh; if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family || fl6->flowi6_oif != nh->fib_nh_dev->ifindex) return false; /* rt_cache's gateway might be different from its 'parent' * in the case of an ip redirect. * So we keep searching in the exception table if the gateway * is different. */ if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) { struct rt6_info *rt_cache; rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr); if (rt_cache && ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) { *ret = rt_cache; return true; } return false; } return true; } struct fib6_nh_rd_arg { struct fib6_result *res; struct flowi6 *fl6; const struct in6_addr *gw; struct rt6_info **ret; }; static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg) { struct fib6_nh_rd_arg *arg = _arg; arg->res->nh = nh; return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret); } /* Handle redirects */ struct ip6rd_flowi { struct flowi6 fl6; struct in6_addr gateway; }; INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; struct rt6_info *ret = NULL; struct fib6_result res = {}; struct fib6_nh_rd_arg arg = { .res = &res, .fl6 = fl6, .gw = &rdfl->gateway, .ret = &ret }; struct fib6_info *rt; struct fib6_node *fn; /* Get the "current" route for this destination and * check if the redirect has come from appropriate router. * * RFC 4861 specifies that redirects should only be * accepted if they come from the nexthop to the target. * Due to the way the routes are chosen, this notion * is a bit fuzzy and one might need to check all possible * routes. */ rcu_read_lock(); fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: for_each_fib6_node_rt_rcu(fn) { res.f6i = rt; if (fib6_check_expired(rt)) continue; if (rt->fib6_flags & RTF_REJECT) break; if (unlikely(rt->nh)) { if (nexthop_is_blackhole(rt->nh)) continue; /* on match, res->nh is filled in and potentially ret */ if (nexthop_for_each_fib6_nh(rt->nh, fib6_nh_redirect_match, &arg)) goto out; } else { res.nh = rt->fib6_nh; if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway, &ret)) goto out; } } if (!rt) rt = net->ipv6.fib6_null_entry; else if (rt->fib6_flags & RTF_REJECT) { ret = net->ipv6.ip6_null_entry; goto out; } if (rt == net->ipv6.fib6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } res.f6i = rt; res.nh = rt->fib6_nh; out: if (ret) { ip6_hold_safe(net, &ret); } else { res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; ret = ip6_create_rt_rcu(&res); } rcu_read_unlock(); trace_fib6_table_lookup(net, &res, table, fl6); return ret; }; static struct dst_entry *ip6_route_redirect(struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, const struct in6_addr *gateway) { int flags = RT6_LOOKUP_F_HAS_SADDR; struct ip6rd_flowi rdfl; rdfl.fl6 = *fl6; rdfl.gateway = *gateway; return fib6_rule_lookup(net, &rdfl.fl6, skb, flags, __ip6_route_redirect); } void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, kuid_t uid) { const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data; struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, .flowi6_mark = mark, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), .flowi6_uid = uid, }; dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } EXPORT_SYMBOL_GPL(ip6_redirect); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) { const struct ipv6hdr *iph = ipv6_hdr(skb); const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb); struct dst_entry *dst; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_oif = oif, .daddr = msg->dest, .saddr = iph->daddr, .flowi6_uid = sock_net_uid(net, NULL), }; dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr); rt6_do_redirect(dst, NULL, skb); dst_release(dst); } void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), sk_uid(sk)); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { unsigned int mtu = dst_mtu(dst); struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); rcu_read_lock(); net = dst_dev_net_rcu(dst); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; rcu_read_unlock(); /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. * IPV6_MAXPLEN is also valid and means: "any MSS, * rely only on pmtu discovery" */ if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) mtu = IPV6_MAXPLEN; return mtu; } INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { return ip6_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ip6_mtu); /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. mtu from egress device * * based on ip6_dst_mtu_forward and exception logic of * rt6_find_cached_rt; called with rcu_read_lock */ u32 ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr) { const struct fib6_nh *nh = res->nh; struct fib6_info *f6i = res->f6i; struct inet6_dev *idev; struct rt6_info *rt; u32 mtu = 0; if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) { mtu = f6i->fib6_pmtu; if (mtu) goto out; } rt = rt6_find_cached_rt(res, daddr, saddr); if (unlikely(rt)) { mtu = dst_metric_raw(&rt->dst, RTAX_MTU); } else { struct net_device *dev = nh->fib_nh_dev; mtu = IPV6_MIN_MTU; idev = __in6_dev_get(dev); if (idev) mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6)); } mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); out: return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu); } struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6) { struct dst_entry *dst; struct rt6_info *rt; struct inet6_dev *idev = in6_dev_get(dev); struct net *net = dev_net(dev); if (unlikely(!idev)) return ERR_PTR(-ENODEV); rt = ip6_dst_alloc(net, dev, 0); if (unlikely(!rt)) { in6_dev_put(idev); dst = ERR_PTR(-ENOMEM); goto out; } rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_gateway = fl6->daddr; rt->rt6i_dst.addr = fl6->daddr; rt->rt6i_dst.plen = 128; rt->rt6i_idev = idev; dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0); /* Add this dst into uncached_list so that rt6_disable_ip() can * do proper release of the net_device */ rt6_uncached_list_add(rt); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); out: return dst; } static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; unsigned int val; int entries; if (time_after(rt_last_gc + rt_min_interval, jiffies)) goto out; fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true); entries = dst_entries_get_slow(ops); if (entries < ops->gc_thresh) atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1); out: val = atomic_read(&net->ipv6.ip6_rt_gc_expire); atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity)); } static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg, const struct in6_addr *gw_addr, u32 tbid, int flags, struct fib6_result *res) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, .saddr = cfg->fc_prefsrc, }; struct fib6_table *table; int err; table = fib6_get_table(net, tbid); if (!table) return -EINVAL; if (!ipv6_addr_any(&cfg->fc_prefsrc)) flags |= RT6_LOOKUP_F_HAS_SADDR; flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE; err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags); if (!err && res->f6i != net->ipv6.fib6_null_entry) fib6_select_path(net, res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); return err; } static int ip6_route_check_nh_onlink(struct net *net, struct fib6_config *cfg, const struct net_device *dev, struct netlink_ext_ack *extack) { u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; struct fib6_result res = {}; int err; err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); if (!err && !(res.fib6_flags & RTF_REJECT) && /* ignore match if it is the default route */ !ipv6_addr_any(&res.f6i->fib6_dst.addr) && (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway or device mismatch"); err = -EINVAL; } return err; } static int ip6_route_check_nh(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev) { const struct in6_addr *gw_addr = &cfg->fc_gateway; struct net_device *dev = _dev ? *_dev : NULL; int flags = RT6_LOOKUP_F_IFACE; struct fib6_result res = {}; int err = -EHOSTUNREACH; if (cfg->fc_table) { err = ip6_nh_lookup_table(net, cfg, gw_addr, cfg->fc_table, flags, &res); /* gw_addr can not require a gateway or resolve to a reject * route. If a device is given, it must match the result. */ if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family || (dev && dev != res.nh->fib_nh_dev)) err = -EHOSTUNREACH; } if (err < 0) { struct flowi6 fl6 = { .flowi6_oif = cfg->fc_ifindex, .daddr = *gw_addr, }; err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags); if (err || res.fib6_flags & RTF_REJECT || res.nh->fib_nh_gw_family) err = -EHOSTUNREACH; if (err) return err; fib6_select_path(net, &res, &fl6, cfg->fc_ifindex, cfg->fc_ifindex != 0, NULL, flags); } err = 0; if (dev) { if (dev != res.nh->fib_nh_dev) err = -EHOSTUNREACH; } else { *_dev = dev = res.nh->fib_nh_dev; netdev_hold(dev, dev_tracker, GFP_ATOMIC); *idev = in6_dev_get(dev); } return err; } static int ip6_validate_gw(struct net *net, struct fib6_config *cfg, struct net_device **_dev, netdevice_tracker *dev_tracker, struct inet6_dev **idev, struct netlink_ext_ack *extack) { const struct in6_addr *gw_addr = &cfg->fc_gateway; int gwa_type = ipv6_addr_type(gw_addr); bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true; const struct net_device *dev = *_dev; bool need_addr_check = !dev; int err = -EINVAL; /* if gw_addr is local we will fail to detect this in case * address is still TENTATIVE (DAD in progress). rt6_lookup() * will return already-added prefix route via interface that * prefix route was assigned to, which might be non-loopback. */ if (dev && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) { /* IPv6 strictly inhibits using not link-local * addresses as nexthop address. * Otherwise, router will not able to send redirects. * It is very good, but in some (rare!) circumstances * (SIT, PtP, NBMA NOARP links) it is handy to allow * some exceptions. --ANK * We allow IPv4-mapped nexthops to support RFC4798-type * addressing */ if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) { NL_SET_ERR_MSG(extack, "Invalid gateway address"); goto out; } rcu_read_lock(); if (cfg->fc_flags & RTNH_F_ONLINK) err = ip6_route_check_nh_onlink(net, cfg, dev, extack); else err = ip6_route_check_nh(net, cfg, _dev, dev_tracker, idev); rcu_read_unlock(); if (err) goto out; } /* reload in case device was changed */ dev = *_dev; err = -EINVAL; if (!dev) { NL_SET_ERR_MSG(extack, "Egress device not specified"); goto out; } else if (dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "Egress device can not be loopback device for this route"); goto out; } /* if we did not check gw_addr above, do so now that the * egress device has been resolved. */ if (need_addr_check && ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) { NL_SET_ERR_MSG(extack, "Gateway can not be a local address"); goto out; } err = 0; out: return err; } static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type) { if ((flags & RTF_REJECT) || (dev && (dev->flags & IFF_LOOPBACK) && !(addr_type & IPV6_ADDR_LOOPBACK) && !(flags & (RTF_ANYCAST | RTF_LOCAL)))) return true; return false; } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker; struct net_device *dev = NULL; struct inet6_dev *idev = NULL; int addr_type; int err; fib6_nh->fib_nh_family = AF_INET6; #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif if (cfg->fc_is_fdb) { fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; return 0; } err = -ENODEV; if (cfg->fc_ifindex) { dev = netdev_get_by_index(net, cfg->fc_ifindex, dev_tracker, gfp_flags); if (!dev) goto out; idev = in6_dev_get(dev); if (!idev) goto out; } if (cfg->fc_flags & RTNH_F_ONLINK) { if (!dev) { NL_SET_ERR_MSG(extack, "Nexthop device required for onlink"); goto out; } if (!(dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } fib6_nh->fib_nh_flags |= RTNH_F_ONLINK; } fib6_nh->fib_nh_weight = 1; /* We cannot add true routes via loopback here, * they would result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { netdev_put(dev, dev_tracker); in6_dev_put(idev); } dev = net->loopback_dev; netdev_hold(dev, dev_tracker, gfp_flags); idev = in6_dev_get(dev); if (!idev) { err = -ENODEV; goto out; } } goto pcpu_alloc; } if (cfg->fc_flags & RTF_GATEWAY) { err = ip6_validate_gw(net, cfg, &dev, dev_tracker, &idev, extack); if (err) goto out; fib6_nh->fib_nh_gw6 = cfg->fc_gateway; fib6_nh->fib_nh_gw_family = AF_INET6; } err = -ENODEV; if (!dev) goto out; if (!idev || idev->cnf.disable_ipv6) { NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device"); err = -EACCES; goto out; } if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) && !netif_carrier_ok(dev)) fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap, cfg->fc_encap_type, cfg, gfp_flags, extack); if (err) goto out; pcpu_alloc: fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); if (!fib6_nh->rt6i_pcpu) { err = -ENOMEM; goto out; } fib6_nh->fib_nh_dev = dev; fib6_nh->fib_nh_oif = dev->ifindex; err = 0; out: if (idev) in6_dev_put(idev); if (err) { fib_nh_common_release(&fib6_nh->nh_common); fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } return err; } void fib6_nh_release(struct fib6_nh *fib6_nh) { struct rt6_exception_bucket *bucket; rcu_read_lock(); fib6_nh_flush_exceptions(fib6_nh, NULL); bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL); if (bucket) { rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL); kfree(bucket); } rcu_read_unlock(); fib6_nh_release_dsts(fib6_nh); free_percpu(fib6_nh->rt6i_pcpu); fib_nh_common_release(&fib6_nh->nh_common); } void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) { int cpu; if (!fib6_nh->rt6i_pcpu) return; for_each_possible_cpu(cpu) { struct rt6_info *pcpu_rt, **ppcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = xchg(ppcpu_rt, NULL); if (pcpu_rt) { dst_dev_put(&pcpu_rt->dst); dst_release(&pcpu_rt->dst); } } } static int fib6_config_validate(struct fib6_config *cfg, struct netlink_ext_ack *extack) { /* RTF_PCPU is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_PCPU) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); goto errout; } /* RTF_CACHE is an internal flag; can not be set by userspace */ if (cfg->fc_flags & RTF_CACHE) { NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); goto errout; } if (cfg->fc_type > RTN_MAX) { NL_SET_ERR_MSG(extack, "Invalid route type"); goto errout; } if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto errout; } #ifdef CONFIG_IPV6_SUBTREES if (cfg->fc_src_len > 128) { NL_SET_ERR_MSG(extack, "Invalid source address length"); goto errout; } if (cfg->fc_nh_id && cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); goto errout; } #else if (cfg->fc_src_len) { NL_SET_ERR_MSG(extack, "Specifying source address requires IPV6_SUBTREES to be enabled"); goto errout; } #endif return 0; errout: return -EINVAL; } static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_table *table; struct fib6_info *rt; int err; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); if (!table) { pr_warn("NLM_F_CREATE should be specified when creating new route\n"); table = fib6_new_table(net, cfg->fc_table); } } else { table = fib6_new_table(net, cfg->fc_table); } if (!table) { err = -ENOBUFS; goto err; } rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); if (!rt) { err = -ENOMEM; goto err; } rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); goto free; } if (cfg->fc_flags & RTF_ADDRCONF) rt->dst_nocount = true; if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? : RTN_UNICAST; rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY; ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->fib6_dst.plen = cfg->fc_dst_len; #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif return rt; free: kfree(rt); err: return ERR_PTR(err); } static int ip6_route_info_create_nh(struct fib6_info *rt, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; struct fib6_nh *fib6_nh; int err; if (cfg->fc_nh_id) { struct nexthop *nh; rcu_read_lock(); nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { err = -EINVAL; NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto out_free; } err = fib6_check_nexthop(nh, cfg, extack); if (err) goto out_free; if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); err = -ENOENT; goto out_free; } rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); rcu_read_unlock(); } else { int addr_type; err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) goto out_release; fib6_nh = rt->fib6_nh; /* We cannot add true routes via loopback here, they would * result in kernel looping; promote them to reject routes */ addr_type = ipv6_addr_type(&cfg->fc_dst); if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type)) rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP; } if (!ipv6_addr_any(&cfg->fc_prefsrc)) { struct net_device *dev = fib6_nh->fib_nh_dev; if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; goto out_release; } rt->fib6_prefsrc.addr = cfg->fc_prefsrc; rt->fib6_prefsrc.plen = 128; } return 0; out_release: fib6_info_release(rt); return err; out_free: rcu_read_unlock(); ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); return err; } int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct fib6_info *rt; int err; err = fib6_config_validate(cfg, extack); if (err) return err; rt = ip6_route_info_create(cfg, gfp_flags, extack); if (IS_ERR(rt)) return PTR_ERR(rt); err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack); if (err) return err; err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); return err; } static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_table *table; int err; if (rt == net->ipv6.fib6_null_entry) { err = -ENOENT; goto out; } table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); spin_unlock_bh(&table->tb6_lock); out: fib6_info_release(rt); return err; } int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { struct nl_info info = { .nl_net = net, .skip_notify = skip_notify }; return __ip6_del_rt(rt, &info); } static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg) { struct nl_info *info = &cfg->fc_nlinfo; struct net *net = info->nl_net; struct sk_buff *skb = NULL; struct fib6_table *table; int err = -ENOENT; if (rt == net->ipv6.fib6_null_entry) goto out_put; table = rt->fib6_table; spin_lock_bh(&table->tb6_lock); if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) { struct fib6_info *sibling, *next_sibling; struct fib6_node *fn; /* prefer to send a single notification with all hops */ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (skb) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; if (rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_DELROUTE, info->portid, seq, 0) < 0) { kfree_skb(skb); skb = NULL; } else info->skip_notify = 1; } /* 'rt' points to the first sibling route. If it is not the * leaf, then we do not need to send a notification. Otherwise, * we need to check if the last sibling has a next route or not * and emit a replace or delete notification, respectively. */ info->skip_notify_kernel = 1; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (rcu_access_pointer(fn->leaf) == rt) { struct fib6_info *last_sibling, *replace_rt; last_sibling = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); replace_rt = rcu_dereference_protected( last_sibling->fib6_next, lockdep_is_held(&table->tb6_lock)); if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); else call_fib6_multipath_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, rt->fib6_nsiblings, NULL); } list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { err = fib6_del(sibling, info); if (err) goto out_unlock; } } err = fib6_del(rt, info); out_unlock: spin_unlock_bh(&table->tb6_lock); out_put: fib6_info_release(rt); if (skb) { rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); } return err; } static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg) { int rc = -ESRCH; if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex) goto out; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway)) goto out; rc = rt6_remove_exception_rt(rt); out: return rc; } static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt, struct fib6_nh *nh) { struct fib6_result res = { .f6i = rt, .nh = nh, }; struct rt6_info *rt_cache; rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src); if (rt_cache) return __ip6_del_cached_rt(rt_cache, cfg); return 0; } struct fib6_nh_del_cached_rt_arg { struct fib6_config *cfg; struct fib6_info *f6i; }; static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg) { struct fib6_nh_del_cached_rt_arg *arg = _arg; int rc; rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh); return rc != -ESRCH ? rc : 0; } static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i) { struct fib6_nh_del_cached_rt_arg arg = { .cfg = cfg, .f6i = f6i }; return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg); } static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_table *table; struct fib6_info *rt; struct fib6_node *fn; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); if (!table) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); return err; } rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, &cfg->fc_src, cfg->fc_src_len, !(cfg->fc_flags & RTF_CACHE)); if (fn) { for_each_fib6_node_rt_rcu(fn) { struct fib6_nh *nh; if (rt->nh && cfg->fc_nh_id && rt->nh->id != cfg->fc_nh_id) continue; if (cfg->fc_flags & RTF_CACHE) { int rc = 0; if (rt->nh) { rc = ip6_del_cached_rt_nh(cfg, rt); } else if (cfg->fc_nh_id) { continue; } else { nh = rt->fib6_nh; rc = ip6_del_cached_rt(cfg, rt, nh); } if (rc != -ESRCH) { rcu_read_unlock(); return rc; } continue; } if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric) continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol) continue; if (rt->nh) { if (!fib6_info_hold_safe(rt)) continue; err = __ip6_del_rt(rt, &cfg->fc_nlinfo); break; } if (cfg->fc_nh_id) continue; nh = rt->fib6_nh; if (cfg->fc_ifindex && (!nh->fib_nh_dev || nh->fib_nh_dev->ifindex != cfg->fc_ifindex)) continue; if (cfg->fc_flags & RTF_GATEWAY && !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6)) continue; if (!fib6_info_hold_safe(rt)) continue; /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) err = __ip6_del_rt(rt, &cfg->fc_nlinfo); else err = __ip6_del_rt_siblings(rt, cfg); break; } } rcu_read_unlock(); return err; } static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct netevent_redirect netevent; struct rt6_info *rt, *nrt = NULL; struct fib6_result res = {}; struct ndisc_options ndopts; struct inet6_dev *in6_dev; struct neighbour *neigh; struct rd_msg *msg; int optlen, on_link; u8 *lladdr; optlen = skb_tail_pointer(skb) - skb_transport_header(skb); optlen -= sizeof(*msg); if (optlen < 0) { net_dbg_ratelimited("rt6_do_redirect: packet too short\n"); return; } msg = (struct rd_msg *)icmp6_hdr(skb); if (ipv6_addr_is_multicast(&msg->dest)) { net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n"); return; } on_link = 0; if (ipv6_addr_equal(&msg->dest, &msg->target)) { on_link = 1; } else if (ipv6_addr_type(&msg->target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n"); return; } in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) return; if (READ_ONCE(in6_dev->cnf.forwarding) || !READ_ONCE(in6_dev->cnf.accept_redirects)) return; /* RFC2461 8.1: * The IP source address of the Redirect MUST be the same as the current * first-hop router for the specified ICMP Destination Address. */ if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) { net_dbg_ratelimited("rt6_redirect: invalid ND options\n"); return; } lladdr = NULL; if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, skb->dev); if (!lladdr) { net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n"); return; } } rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_REJECT) { net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n"); return; } /* Redirect received -> path was valid. * Look, redirects are sent only in response to data packets, * so that this nexthop apparently is reachable. --ANK */ dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); if (!neigh) return; /* * We have finally decided to accept it. */ ndisc_update(skb->dev, neigh, lladdr, NUD_STALE, NEIGH_UPDATE_F_WEAK_OVERRIDE| NEIGH_UPDATE_F_OVERRIDE| (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| NEIGH_UPDATE_F_ISROUTER)), NDISC_REDIRECT, &ndopts); rcu_read_lock(); res.f6i = rcu_dereference(rt->from); if (!res.f6i) goto out; if (res.f6i->nh) { struct fib6_nh_match_arg arg = { .dev = dst_dev_rcu(dst), .gw = &rt->rt6i_gateway, }; nexthop_for_each_fib6_nh(res.f6i->nh, fib6_nh_find_match, &arg); /* fib6_info uses a nexthop that does not have fib6_nh * using the dst->dev. Should be impossible */ if (!arg.match) goto out; res.nh = arg.match; } else { res.nh = res.f6i->fib6_nh; } res.fib6_flags = res.f6i->fib6_flags; res.fib6_type = res.f6i->fib6_type; nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL); if (!nrt) goto out; nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; /* rt6_insert_exception() will take care of duplicated exceptions */ if (rt6_insert_exception(nrt, &res)) { dst_release_immediate(&nrt->dst); goto out; } netevent.old = &rt->dst; netevent.new = &nrt->dst; netevent.daddr = &msg->dest; netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); out: rcu_read_unlock(); neigh_release(neigh); } #ifdef CONFIG_IPV6_ROUTE_INFO static struct fib6_info *rt6_get_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; int ifindex = dev->ifindex; struct fib6_node *fn; struct fib6_info *rt = NULL; struct fib6_table *table; table = fib6_get_table(net, tb_id); if (!table) return NULL; rcu_read_lock(); fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; for_each_fib6_node_rt_rcu(fn) { /* these routes do not use nexthops */ if (rt->nh) continue; if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex) continue; if (!(rt->fib6_flags & RTF_ROUTEINFO) || !rt->fib6_nh->fib_nh_gw_family) continue; if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr)) continue; if (!fib6_info_hold_safe(rt)) continue; break; } out: rcu_read_unlock(); return rt; } static struct fib6_info *rt6_add_route_info(struct net *net, const struct in6_addr *prefix, int prefixlen, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref) { struct fib6_config cfg = { .fc_metric = IP6_RT_PRIO_USER, .fc_ifindex = dev->ifindex, .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), .fc_protocol = RTPROT_RA, .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, }; cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO; cfg.fc_dst = *prefix; cfg.fc_gateway = *gwaddr; /* We should treat it as a default route if prefix length is 0. */ if (!prefixlen) cfg.fc_flags |= RTF_DEFAULT; ip6_route_add(&cfg, GFP_ATOMIC, NULL); return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev); } #endif struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev) { u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT; struct fib6_info *rt; struct fib6_table *table; table = fib6_get_table(net, tb_id); if (!table) return NULL; rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { struct fib6_nh *nh; /* RA routes do not use nexthops */ if (rt->nh) continue; nh = rt->fib6_nh; if (dev == nh->fib_nh_dev && ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&nh->fib_nh_gw6, addr)) break; } if (rt && !fib6_info_hold_safe(rt)) rt = NULL; rcu_read_unlock(); return rt; } struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, u32 defrtr_usr_metric, int lifetime) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, .fc_metric = defrtr_usr_metric, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), .fc_protocol = RTPROT_RA, .fc_type = RTN_UNICAST, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, .fc_expires = jiffies_to_clock_t(lifetime * HZ), }; cfg.fc_gateway = *gwaddr; if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) { struct fib6_table *table; table = fib6_get_table(dev_net(dev), cfg.fc_table); if (table) table->flags |= RT6_TABLE_HAS_DFLT_ROUTER; } return rt6_get_dflt_router(net, gwaddr, dev); } static void __rt6_purge_dflt_routers(struct net *net, struct fib6_table *table) { struct fib6_info *rt; restart: rcu_read_lock(); for_each_fib6_node_rt_rcu(&table->tb6_root) { struct net_device *dev = fib6_info_nh_dev(rt); struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL; if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); ip6_del_rt(net, rt, false); goto restart; } } rcu_read_unlock(); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } void rt6_purge_dflt_routers(struct net *net) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER) __rt6_purge_dflt_routers(net, table); } } rcu_read_unlock(); } static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) { *cfg = (struct fib6_config){ .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ? : RT6_TABLE_MAIN, .fc_ifindex = rtmsg->rtmsg_ifindex, .fc_metric = rtmsg->rtmsg_metric, .fc_expires = rtmsg->rtmsg_info, .fc_dst_len = rtmsg->rtmsg_dst_len, .fc_src_len = rtmsg->rtmsg_src_len, .fc_flags = rtmsg->rtmsg_flags, .fc_type = rtmsg->rtmsg_type, .fc_nlinfo.nl_net = net, .fc_dst = rtmsg->rtmsg_dst, .fc_src = rtmsg->rtmsg_src, .fc_gateway = rtmsg->rtmsg_gateway, }; } int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) { struct fib6_config cfg; int err; if (cmd != SIOCADDRT && cmd != SIOCDELRT) return -EINVAL; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtmsg_to_fib6_config(net, rtmsg, &cfg); switch (cmd) { case SIOCADDRT: /* Only do the default setting of fc_metric in route adding */ if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; err = ip6_route_add(&cfg, GFP_KERNEL, NULL); break; case SIOCDELRT: err = ip6_route_del(&cfg, NULL); break; } return err; } /* * Drop the packet on the floor */ static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst_dev(dst); struct net *net = dev_net(dev); struct inet6_dev *idev; SKB_DR(reason); int type; if (netif_is_l3_master(skb->dev) || dev == net->loopback_dev) idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); else idev = ip6_dst_idev(dst); switch (ipstats_mib_noroutes) { case IPSTATS_MIB_INNOROUTES: type = ipv6_addr_type(&ipv6_hdr(skb)->daddr); if (type == IPV6_ADDR_ANY) { SKB_DR_SET(reason, IP_INADDRERRORS); IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); break; } SKB_DR_SET(reason, IP_INNOROUTES); fallthrough; case IPSTATS_MIB_OUTNOROUTES: SKB_DR_OR(reason, IP_OUTNOROUTES); IP6_INC_STATS(net, idev, ipstats_mib_noroutes); break; } /* Start over by dropping the dst for l3mdev case */ if (netif_is_l3_master(skb->dev)) skb_dst_drop(skb); icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0); kfree_skb_reason(skb, reason); return 0; } static int ip6_pkt_discard(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES); } static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } static int ip6_pkt_prohibit(struct sk_buff *skb) { return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES); } static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } /* * Allocate a dst for local (unicast / anycast) address. */ struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL, .fc_ifindex = idev->dev->ifindex, .fc_flags = RTF_UP | RTF_NONEXTHOP, .fc_dst = *addr, .fc_dst_len = 128, .fc_protocol = RTPROT_KERNEL, .fc_nlinfo.nl_net = net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; int err; if (anycast) { cfg.fc_type = RTN_ANYCAST; cfg.fc_flags |= RTF_ANYCAST; } else { cfg.fc_type = RTN_LOCAL; cfg.fc_flags |= RTF_LOCAL; } f6i = ip6_route_info_create(&cfg, gfp_flags, extack); if (IS_ERR(f6i)) return f6i; err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack); if (err) return ERR_PTR(err); f6i->dst_nocount = true; if (!anycast && (READ_ONCE(net->ipv6.devconf_all->disable_policy) || READ_ONCE(idev->cnf.disable_policy))) f6i->dst_nopolicy = true; return f6i; } /* remove deleted ip from prefsrc entries */ struct arg_dev_net_ip { struct net *net; struct in6_addr *addr; }; static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg) { struct net *net = ((struct arg_dev_net_ip *)arg)->net; struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr; if (!rt->nh && rt != net->ipv6.fib6_null_entry && ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) && !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) { spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->fib6_prefsrc.plen = 0; spin_unlock_bh(&rt6_exception_lock); } return 0; } void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) { struct net *net = dev_net(ifp->idev->dev); struct arg_dev_net_ip adni = { .net = net, .addr = &ifp->addr, }; fib6_clean_all(net, fib6_remove_prefsrc, &adni); } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct fib6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; struct fib6_nh *nh; /* RA routes do not use nexthops */ if (rt->nh) return 0; nh = rt->fib6_nh; if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6)) return -1; /* Further clean up cached routes in exception table. * This is needed because cached route may have a different * gateway than its 'parent' in the case of an ip redirect. */ fib6_nh_exceptions_clean_tohost(nh, gateway); return 0; } void rt6_clean_tohost(struct net *net, struct in6_addr *gateway) { fib6_clean_all(net, fib6_clean_tohost, gateway); } struct arg_netdev_event { const struct net_device *dev; union { unsigned char nh_flags; unsigned long event; }; }; static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt) { struct fib6_info *iter; struct fib6_node *fn; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); iter = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(iter)) return iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); } return NULL; } /* only called for fib entries with builtin fib6_nh */ static bool rt6_is_dead(const struct fib6_info *rt) { if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD || (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN && ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev))) return true; return false; } static int rt6_multipath_total_weight(const struct fib6_info *rt) { struct fib6_info *iter; int total = 0; if (!rt6_is_dead(rt)) total += rt->fib6_nh->fib_nh_weight; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) { if (!rt6_is_dead(iter)) total += iter->fib6_nh->fib_nh_weight; } return total; } static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total) { int upper_bound = -1; if (!rt6_is_dead(rt)) { *weight += rt->fib6_nh->fib_nh_weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31, total) - 1; } atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound); } static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total) { struct fib6_info *iter; int weight = 0; rt6_upper_bound_set(rt, &weight, total); list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) rt6_upper_bound_set(iter, &weight, total); } void rt6_multipath_rebalance(struct fib6_info *rt) { struct fib6_info *first; int total; /* In case the entire multipath route was marked for flushing, * then there is no need to rebalance upon the removal of every * sibling route. */ if (!rt->fib6_nsiblings || rt->should_flush) return; /* During lookup routes are evaluated in order, so we need to * make sure upper bounds are assigned from the first sibling * onwards. */ first = rt6_multipath_first_sibling(rt); if (WARN_ON_ONCE(!first)) return; total = rt6_multipath_total_weight(first); rt6_multipath_upper_bound_set(first, total); } static int fib6_ifup(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; struct net *net = dev_net(arg->dev); if (rt != net->ipv6.fib6_null_entry && !rt->nh && rt->fib6_nh->fib_nh_dev == arg->dev) { rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags; fib6_update_sernum_upto_root(net, rt); rt6_multipath_rebalance(rt); } return 0; } void rt6_sync_up(struct net_device *dev, unsigned char nh_flags) { struct arg_netdev_event arg = { .dev = dev, { .nh_flags = nh_flags, }, }; if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev)) arg.nh_flags |= RTNH_F_LINKDOWN; fib6_clean_all(dev_net(dev), fib6_ifup, &arg); } /* only called for fib entries with inline fib6_nh */ static bool rt6_multipath_uses_dev(const struct fib6_info *rt, const struct net_device *dev) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) return true; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == dev) return true; return false; } static void rt6_multipath_flush(struct fib6_info *rt) { struct fib6_info *iter; rt->should_flush = 1; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) iter->should_flush = 1; } static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt, const struct net_device *down_dev) { struct fib6_info *iter; unsigned int dead = 0; if (rt->fib6_nh->fib_nh_dev == down_dev || rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == down_dev || iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD) dead++; return dead; } static void rt6_multipath_nh_flags_set(struct fib6_info *rt, const struct net_device *dev, unsigned char nh_flags) { struct fib6_info *iter; if (rt->fib6_nh->fib_nh_dev == dev) rt->fib6_nh->fib_nh_flags |= nh_flags; list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) if (iter->fib6_nh->fib_nh_dev == dev) iter->fib6_nh->fib_nh_flags |= nh_flags; } /* called with write lock held for table with rt */ static int fib6_ifdown(struct fib6_info *rt, void *p_arg) { const struct arg_netdev_event *arg = p_arg; const struct net_device *dev = arg->dev; struct net *net = dev_net(dev); if (rt == net->ipv6.fib6_null_entry || rt->nh) return 0; switch (arg->event) { case NETDEV_UNREGISTER: return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; case NETDEV_DOWN: if (rt->should_flush) return -1; if (!rt->fib6_nsiblings) return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0; if (rt6_multipath_uses_dev(rt, dev)) { unsigned int count; count = rt6_multipath_dead_count(rt, dev); if (rt->fib6_nsiblings + 1 == count) { rt6_multipath_flush(rt); return -1; } rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD | RTNH_F_LINKDOWN); fib6_update_sernum(net, rt); rt6_multipath_rebalance(rt); } return -2; case NETDEV_CHANGE: if (rt->fib6_nh->fib_nh_dev != dev || rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) break; rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN; rt6_multipath_rebalance(rt); break; } return 0; } void rt6_sync_down_dev(struct net_device *dev, unsigned long event) { struct arg_netdev_event arg = { .dev = dev, { .event = event, }, }; struct net *net = dev_net(dev); if (net->ipv6.sysctl.skip_notify_on_dev_down) fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); else fib6_clean_all(net, fib6_ifdown, &arg); } void rt6_disable_ip(struct net_device *dev, unsigned long event) { rt6_sync_down_dev(dev, event); rt6_uncached_list_flush_dev(dev); neigh_ifdown(&nd_tbl, dev); } struct rt6_mtu_change_arg { struct net_device *dev; unsigned int mtu; struct fib6_info *f6i; }; static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg; struct fib6_info *f6i = arg->f6i; /* For administrative MTU increase, there is no way to discover * IPv6 PMTU increase, so PMTU increase should be updated here. * Since RFC 1981 doesn't include administrative MTU increase * update PMTU increase is a MUST. (i.e. jumbo frame) */ if (nh->fib_nh_dev == arg->dev) { struct inet6_dev *idev = __in6_dev_get(arg->dev); u32 mtu = f6i->fib6_pmtu; if (mtu >= arg->mtu || (mtu < arg->mtu && mtu == idev->cnf.mtu6)) fib6_metric_set(f6i, RTAX_MTU, arg->mtu); spin_lock_bh(&rt6_exception_lock); rt6_exceptions_update_pmtu(idev, nh, arg->mtu); spin_unlock_bh(&rt6_exception_lock); } return 0; } static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) { struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; struct inet6_dev *idev; /* In IPv6 pmtu discovery is not optional, so that RTAX_MTU lock cannot disable it. We still use this lock to block changes caused by addrconf/ndisc. */ idev = __in6_dev_get(arg->dev); if (!idev) return 0; if (fib6_metric_locked(f6i, RTAX_MTU)) return 0; arg->f6i = f6i; if (f6i->nh) { /* fib6_nh_mtu_change only returns 0, so this is safe */ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, arg); } return fib6_nh_mtu_change(f6i->fib6_nh, arg); } void rt6_mtu_change(struct net_device *dev, unsigned int mtu) { struct rt6_mtu_change_arg arg = { .dev = dev, .mtu = mtu, }; fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg); } static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 }, [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) }, [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) }, [RTA_OIF] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, [RTA_PRIORITY] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_PREF] = { .type = NLA_U8 }, [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, [RTA_ENCAP] = { .type = NLA_NESTED }, [RTA_EXPIRES] = { .type = NLA_U32 }, [RTA_UID] = { .type = NLA_U32 }, [RTA_MARK] = { .type = NLA_U32 }, [RTA_TABLE] = { .type = NLA_U32 }, [RTA_IP_PROTO] = { .type = NLA_U8 }, [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, struct netlink_ext_ack *extack, bool newroute) { struct rtnexthop *rtnh; int remaining; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; if (!rtnh_ok(rtnh, remaining)) { NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops"); return -EINVAL; } do { bool has_gateway = cfg->fc_flags & RTF_GATEWAY; int attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs; attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { if (nla_len(nla) < sizeof(cfg->fc_gateway)) { NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); return -EINVAL; } has_gateway = true; } } if (newroute && (cfg->fc_nh_id || !has_gateway)) { NL_SET_ERR_MSG(extack, "Device only routes can not be added for IPv6 using the multipath API."); return -EINVAL; } rtnh = rtnh_next(rtnh, &remaining); } while (rtnh_ok(rtnh, remaining)); return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack); } static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { bool newroute = nlh->nlmsg_type == RTM_NEWROUTE; struct nlattr *tb[RTA_MAX+1]; struct rtmsg *rtm; unsigned int pref; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); if (rtm->rtm_tos) { NL_SET_ERR_MSG(extack, "Invalid dsfield (tos): option not available for IPv6"); goto errout; } if (tb[RTA_FLOWLABEL]) { NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], "Flow label cannot be specified for this operation"); goto errout; } *cfg = (struct fib6_config){ .fc_table = rtm->rtm_table, .fc_dst_len = rtm->rtm_dst_len, .fc_src_len = rtm->rtm_src_len, .fc_flags = RTF_UP, .fc_protocol = rtm->rtm_protocol, .fc_type = rtm->rtm_type, .fc_nlinfo.portid = NETLINK_CB(skb).portid, .fc_nlinfo.nlh = nlh, .fc_nlinfo.nl_net = sock_net(skb->sk), }; if (rtm->rtm_type == RTN_UNREACHABLE || rtm->rtm_type == RTN_BLACKHOLE || rtm->rtm_type == RTN_PROHIBIT || rtm->rtm_type == RTN_THROW) cfg->fc_flags |= RTF_REJECT; if (rtm->rtm_type == RTN_LOCAL) cfg->fc_flags |= RTF_LOCAL; if (rtm->rtm_flags & RTM_F_CLONED) cfg->fc_flags |= RTF_CACHE; cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK); if (tb[RTA_NH_ID]) { if (tb[RTA_GATEWAY] || tb[RTA_OIF] || tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); goto errout; } cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]); } if (tb[RTA_GATEWAY]) { cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; } if (tb[RTA_VIA]) { NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); goto errout; } if (tb[RTA_DST]) { int plen = (rtm->rtm_dst_len + 7) >> 3; if (nla_len(tb[RTA_DST]) < plen) goto errout; nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen); } if (tb[RTA_SRC]) { int plen = (rtm->rtm_src_len + 7) >> 3; if (nla_len(tb[RTA_SRC]) < plen) goto errout; nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen); } if (tb[RTA_PREFSRC]) cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]); if (tb[RTA_OIF]) cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]); if (tb[RTA_PRIORITY]) cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]); if (tb[RTA_METRICS]) { cfg->fc_mx = nla_data(tb[RTA_METRICS]); cfg->fc_mx_len = nla_len(tb[RTA_METRICS]); } if (tb[RTA_TABLE]) cfg->fc_table = nla_get_u32(tb[RTA_TABLE]); if (tb[RTA_MULTIPATH]) { cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); err = rtm_to_fib6_multipath_config(cfg, extack, newroute); if (err < 0) goto errout; } if (tb[RTA_PREF]) { pref = nla_get_u8(tb[RTA_PREF]); if (pref != ICMPV6_ROUTER_PREF_LOW && pref != ICMPV6_ROUTER_PREF_HIGH) pref = ICMPV6_ROUTER_PREF_MEDIUM; cfg->fc_flags |= RTF_PREF(pref); } if (tb[RTA_ENCAP]) cfg->fc_encap = tb[RTA_ENCAP]; if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; } if (tb[RTA_EXPIRES]) { unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ); if (addrconf_finite_timeout(timeout)) { cfg->fc_expires = jiffies_to_clock_t(timeout * HZ); cfg->fc_flags |= RTF_EXPIRES; } } err = 0; errout: return err; } struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; struct list_head list; }; static int ip6_route_info_append(struct list_head *rt6_nh_list, struct fib6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; list_for_each_entry(nh, rt6_nh_list, list) { /* check if fib6_info already exists */ if (rt6_duplicate_nexthop(nh->fib6_info, rt)) return -EEXIST; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; nh->fib6_info = rt; memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); list_add_tail(&nh->list, rt6_nh_list); return 0; } static void ip6_route_mpath_notify(struct fib6_info *rt, struct fib6_info *rt_last, struct nl_info *info, __u16 nlflags) { /* if this is an APPEND route, then rt points to the first route * inserted and rt_last points to last route inserted. Userspace * wants a consistent dump of the route which starts at the first * nexthop. Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ rcu_read_lock(); if ((nlflags & NLM_F_APPEND) && rt_last && READ_ONCE(rt_last->fib6_nsiblings)) { rt = list_first_or_null_rcu(&rt_last->fib6_siblings, struct fib6_info, fib6_siblings); } if (rt) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); rcu_read_unlock(); } static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) { bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool should_notify = false; struct fib6_info *leaf; struct fib6_node *fn; rcu_read_lock(); fn = rcu_dereference(rt->fib6_node); if (!fn) goto out; leaf = rcu_dereference(fn->leaf); if (!leaf) goto out; if (rt == leaf || (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf))) should_notify = true; out: rcu_read_unlock(); return should_notify; } static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; struct rt6_nh *nh, *nh_safe; struct fib6_config r_cfg; struct rtnexthop *rtnh; LIST_HEAD(rt6_nh_list); struct rt6_nh *err_nh; struct fib6_info *rt; __u16 nlflags; int remaining; int attrlen; int replace; int nhn = 0; int err; err = fib6_config_validate(cfg, extack); if (err) return err; replace = (cfg->fc_nlinfo.nlh && (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE; if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry and build a list (rt6_nh_list) of * fib6_info structs per nexthop */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) r_cfg.fc_ifindex = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); } r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; goto cleanup; } err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack); if (err) { rt = NULL; goto cleanup; } rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { fib6_info_release(rt); goto cleanup; } rtnh = rtnh_next(rtnh, &remaining); } /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done */ info->skip_notify = 1; /* For add and replace, send one notification with all nexthops. For * append, send one notification with all appended nexthops. */ info->skip_notify_kernel = 1; err_nh = NULL; list_for_each_entry(nh, &rt6_nh_list, list) { err = __ip6_ins_rt(nh->fib6_info, info, extack); if (err) { if (replace && nhn) NL_SET_ERR_MSG_MOD(extack, "multipath route replace failed (check consistency of installed routes)"); err_nh = nh; goto add_errout; } /* save reference to last route successfully inserted */ rt_last = nh->fib6_info; /* save reference to first route for notification */ if (!rt_notif) rt_notif = nh->fib6_info; /* Because each route is added like a single route we remove * these flags after the first nexthop: if there is a collision, * we have already failed to add the first nexthop: * fib6_add_rt2node() has rejected it; when replacing, old * nexthops have been replaced by first new, the rest should * be added to it. */ if (cfg->fc_nlinfo.nlh) { cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL | NLM_F_REPLACE); cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE; } nhn++; } /* An in-kernel notification should only be sent in case the new * multipath route is added as the first route in the node, or if * it was appended to it. We pass 'rt_notif' since it is the first * sibling and might allow us to skip some checks in the replace case. */ if (ip6_route_mpath_should_notify(rt_notif)) { enum fib_event_type fib_event; if (rt_notif->fib6_nsiblings != nhn - 1) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_multipath_entry_notifiers(info->nl_net, fib_event, rt_notif, nhn - 1, extack); if (err) { /* Delete all the siblings that were just added */ err_nh = NULL; goto add_errout; } } /* success ... tell user about new route */ ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); goto cleanup; add_errout: /* send notification for routes that were added so that * the delete notifications sent by ip6_route_del are * coherent */ if (rt_notif) ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); /* Delete routes that were already added */ list_for_each_entry(nh, &rt6_nh_list, list) { if (err_nh == nh) break; ip6_route_del(&nh->r_cfg, extack); } cleanup: list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { fib6_info_release(nh->fib6_info); list_del(&nh->list); kfree(nh); } return err; } static int ip6_route_multipath_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_config r_cfg; struct rtnexthop *rtnh; int last_err = 0; int remaining; int attrlen; int err; remaining = cfg->fc_mp_len; rtnh = (struct rtnexthop *)cfg->fc_mp; /* Parse a Multipath Entry */ while (rtnh_ok(rtnh, remaining)) { memcpy(&r_cfg, cfg, sizeof(*cfg)); if (rtnh->rtnh_ifindex) r_cfg.fc_ifindex = rtnh->rtnh_ifindex; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } } err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; rtnh = rtnh_next(rtnh, &remaining); } return last_err; } static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_nh_id) { rcu_read_lock(); err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id); rcu_read_unlock(); if (err) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); return -EINVAL; } } if (cfg.fc_mp) { return ip6_route_multipath_del(&cfg, extack); } else { cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg, extack); } } static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; err = rtm_to_fib6_config(skb, nlh, &cfg, extack); if (err < 0) return err; if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; if (cfg.fc_mp) return ip6_route_multipath_add(&cfg, extack); else return ip6_route_add(&cfg, GFP_KERNEL, extack); } /* add the overhead of this fib6_nh to nexthop_len */ static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg) { int *nexthop_len = arg; *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */ + NLA_ALIGN(sizeof(struct rtnexthop)) + nla_total_size(16); /* RTA_GATEWAY */ if (nh->fib_nh_lws) { /* RTA_ENCAP_TYPE */ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); /* RTA_ENCAP */ *nexthop_len += nla_total_size(2); } return 0; } static size_t rt6_nlmsg_size(struct fib6_info *f6i) { struct fib6_info *sibling; struct fib6_nh *nh; int nexthop_len; if (f6i->nh) { nexthop_len = nla_total_size(4); /* RTA_NH_ID */ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); goto common; } rcu_read_lock(); retry: nh = f6i->fib6_nh; nexthop_len = 0; if (READ_ONCE(f6i->fib6_nsiblings)) { rt6_nh_nlmsg_size(nh, &nexthop_len); list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, fib6_siblings) { rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); if (!READ_ONCE(f6i->fib6_nsiblings)) goto retry; } } rcu_read_unlock(); nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); common: return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ + nla_total_size(16) /* RTA_DST */ + nla_total_size(16) /* RTA_GATEWAY */ + nla_total_size(16) /* RTA_PREFSRC */ + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(4) /* RTA_IIF */ + nla_total_size(4) /* RTA_OIF */ + nla_total_size(4) /* RTA_PRIORITY */ + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ + nexthop_len; } static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh, unsigned char *flags) { if (nexthop_is_multipath(nh)) { struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (nexthop_mpath_fill_node(skb, nh, AF_INET6)) goto nla_put_failure; nla_nest_end(skb, mp); } else { struct fib6_nh *fib6_nh; fib6_nh = nexthop_fib6_nh(nh); if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6, flags, false) < 0) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int rt6_fill_node(struct net *net, struct sk_buff *skb, struct fib6_info *rt, struct dst_entry *dst, struct in6_addr *dest, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags) { struct rt6_info *rt6 = dst_rt6_info(dst); struct rt6key *rt6_dst, *rt6_src; u32 *pmetrics, table, rt6_flags; unsigned char nh_flags = 0; struct nlmsghdr *nlh; struct rtmsg *rtm; long expires = 0; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; if (rt6) { rt6_dst = &rt6->rt6i_dst; rt6_src = &rt6->rt6i_src; rt6_flags = rt6->rt6i_flags; } else { rt6_dst = &rt->fib6_dst; rt6_src = &rt->fib6_src; rt6_flags = rt->fib6_flags; } rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET6; rtm->rtm_dst_len = rt6_dst->plen; rtm->rtm_src_len = rt6_src->plen; rtm->rtm_tos = 0; if (rt->fib6_table) table = rt->fib6_table->tb6_id; else table = RT6_TABLE_UNSPEC; rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; rtm->rtm_type = rt->fib6_type; rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->fib6_protocol; if (rt6_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; if (dest) { if (nla_put_in6_addr(skb, RTA_DST, dest)) goto nla_put_failure; rtm->rtm_dst_len = 128; } else if (rtm->rtm_dst_len) if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr)) goto nla_put_failure; #ifdef CONFIG_IPV6_SUBTREES if (src) { if (nla_put_in6_addr(skb, RTA_SRC, src)) goto nla_put_failure; rtm->rtm_src_len = 128; } else if (rtm->rtm_src_len && nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr)) goto nla_put_failure; #endif if (iif) { #ifdef CONFIG_IPV6_MROUTE if (ipv6_addr_is_multicast(&rt6_dst->addr)) { int err = ip6mr_get_route(net, skb, rtm, portid); if (err == 0) return 0; if (err < 0) goto nla_put_failure; } else #endif if (nla_put_u32(skb, RTA_IIF, iif)) goto nla_put_failure; } else if (dest) { struct in6_addr saddr_buf; if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 && nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } if (rt->fib6_prefsrc.plen) { struct in6_addr saddr_buf; saddr_buf = rt->fib6_prefsrc.addr; if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf)) goto nla_put_failure; } pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics; if (rtnetlink_put_metrics(skb, pmetrics) < 0) goto nla_put_failure; if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric)) goto nla_put_failure; /* For multipath routes, walk the siblings list and add * each as a nexthop within RTA_MULTIPATH. */ if (rt6) { struct net_device *dev; if (rt6_flags & RTF_GATEWAY && nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) goto nla_put_failure; dev = dst_dev(dst); if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; } else if (READ_ONCE(rt->fib6_nsiblings)) { struct fib6_info *sibling; struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, rt->fib6_nh->fib_nh_weight, AF_INET6, 0) < 0) goto nla_put_failure; rcu_read_lock(); list_for_each_entry_rcu(sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, AF_INET6, 0) < 0) { rcu_read_unlock(); goto nla_put_failure; } } rcu_read_unlock(); nla_nest_end(skb, mp); } else if (rt->nh) { if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) goto nla_put_failure; if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) && rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; } else { if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6, &nh_flags, false) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; } if (rt6_flags & RTF_EXPIRES) { expires = dst ? READ_ONCE(dst->expires) : rt->expires; expires -= jiffies; } if (!dst) { if (READ_ONCE(rt->offload)) rtm->rtm_flags |= RTM_F_OFFLOAD; if (READ_ONCE(rt->trap)) rtm->rtm_flags |= RTM_F_TRAP; if (READ_ONCE(rt->offload_failed)) rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED; } if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0) goto nla_put_failure; if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags))) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg) { const struct net_device *dev = arg; if (nh->fib_nh_dev == dev) return 1; return 0; } static bool fib6_info_uses_dev(const struct fib6_info *f6i, const struct net_device *dev) { if (f6i->nh) { struct net_device *_dev = (struct net_device *)dev; return !!nexthop_for_each_fib6_nh(f6i->nh, fib6_info_nh_uses_dev, _dev); } if (f6i->fib6_nh->fib_nh_dev == dev) return true; if (READ_ONCE(f6i->fib6_nsiblings)) { const struct fib6_info *sibling; rcu_read_lock(); list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, fib6_siblings) { if (sibling->fib6_nh->fib_nh_dev == dev) { rcu_read_unlock(); return true; } if (!READ_ONCE(f6i->fib6_nsiblings)) break; } rcu_read_unlock(); } return false; } struct fib6_nh_exception_dump_walker { struct rt6_rtnl_dump_arg *dump; struct fib6_info *rt; unsigned int flags; unsigned int skip; unsigned int count; }; static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg) { struct fib6_nh_exception_dump_walker *w = arg; struct rt6_rtnl_dump_arg *dump = w->dump; struct rt6_exception_bucket *bucket; struct rt6_exception *rt6_ex; int i, err; bucket = fib6_nh_get_excptn_bucket(nh, NULL); if (!bucket) return 0; for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { if (w->skip) { w->skip--; continue; } /* Expiration of entries doesn't bump sernum, insertion * does. Removal is triggered by insertion, so we can * rely on the fact that if entries change between two * partial dumps, this node is scanned again completely, * see rt6_insert_exception() and fib6_dump_table(). * * Count expired entries we go through as handled * entries that we'll skip next time, in case of partial * node dump. Otherwise, if entries expire meanwhile, * we'll skip the wrong amount. */ if (rt6_check_expired(rt6_ex->rt6i)) { w->count++; continue; } err = rt6_fill_node(dump->net, dump->skb, w->rt, &rt6_ex->rt6i->dst, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(dump->cb->skb).portid, dump->cb->nlh->nlmsg_seq, w->flags); if (err) return err; w->count++; } bucket++; } return 0; } /* Return -1 if done with node, number of handled routes on partial dump */ int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip) { struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; struct fib_dump_filter *filter = &arg->filter; unsigned int flags = NLM_F_MULTI; struct net *net = arg->net; int count = 0; if (rt == net->ipv6.fib6_null_entry) return -1; if ((filter->flags & RTM_F_PREFIX) && !(rt->fib6_flags & RTF_PREFIX_RT)) { /* success since this is not a prefix route */ return -1; } if (filter->filter_set && ((filter->rt_type && rt->fib6_type != filter->rt_type) || (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) || (filter->protocol && rt->fib6_protocol != filter->protocol))) { return -1; } if (filter->filter_set || !filter->dump_routes || !filter->dump_exceptions) { flags |= NLM_F_DUMP_FILTERED; } if (filter->dump_routes) { if (skip) { skip--; } else { if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq, flags)) { return 0; } count++; } } if (filter->dump_exceptions) { struct fib6_nh_exception_dump_walker w = { .dump = arg, .rt = rt, .flags = flags, .skip = skip, .count = 0 }; int err; rcu_read_lock(); if (rt->nh) { err = nexthop_for_each_fib6_nh(rt->nh, rt6_nh_dump_exceptions, &w); } else { err = rt6_nh_dump_exceptions(rt->fib6_nh, &w); } rcu_read_unlock(); if (err) return count + w.count; } return -1; } static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request"); return -EINVAL; } if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) { NL_SET_ERR_MSG_MOD(extack, "Invalid flags for get route request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } if (tb[RTA_FLOWLABEL] && (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) { NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], "Invalid flow label"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_SRC: case RTA_DST: case RTA_IIF: case RTA_OIF: case RTA_MARK: case RTA_UID: case RTA_SPORT: case RTA_DPORT: case RTA_IP_PROTO: case RTA_FLOWLABEL: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); return -EINVAL; } } return 0; } static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; int err, iif = 0, oif = 0; struct fib6_info *from; struct dst_entry *dst; struct rt6_info *rt; struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6 = {}; __be32 flowlabel; bool fibmatch; err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr)) goto errout; fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]); } if (tb[RTA_DST]) { if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr)) goto errout; fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]); } if (tb[RTA_IIF]) iif = nla_get_u32(tb[RTA_IIF]); if (tb[RTA_OIF]) oif = nla_get_u32(tb[RTA_OIF]); if (tb[RTA_MARK]) fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]); if (tb[RTA_UID]) fl6.flowi6_uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else fl6.flowi6_uid = iif ? INVALID_UID : current_uid(); if (tb[RTA_SPORT]) fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &fl6.flowi6_proto, AF_INET6, extack); if (err) goto errout; } flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0); fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel); if (iif) { struct net_device *dev; int flags = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(net, iif); if (!dev) { rcu_read_unlock(); err = -ENODEV; goto errout; } fl6.flowi6_iif = iif; if (!ipv6_addr_any(&fl6.saddr)) flags |= RT6_LOOKUP_F_HAS_SADDR; dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags); rcu_read_unlock(); } else { fl6.flowi6_oif = oif; dst = ip6_route_output(net, NULL, &fl6); } rt = dst_rt6_info(dst); if (rt->dst.error) { err = rt->dst.error; ip6_rt_put(rt); goto errout; } if (rt == net->ipv6.ip6_null_entry) { err = rt->dst.error; ip6_rt_put(rt); goto errout; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) { ip6_rt_put(rt); err = -ENOBUFS; goto errout; } skb_dst_set(skb, &rt->dst); rcu_read_lock(); from = rcu_dereference(rt->from); if (from) { if (fibmatch) err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); else err = rt6_fill_node(net, skb, from, dst, &fl6.daddr, &fl6.saddr, iif, RTM_NEWROUTE, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } else { err = -ENETUNREACH; } rcu_read_unlock(); if (err < 0) { kfree_skb(skb); goto errout; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: return err; } void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int nlm_flags) { struct net *net = info->nl_net; struct sk_buff *skb; size_t sz; u32 seq; int err; err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; rcu_read_lock(); sz = rt6_nlmsg_size(rt); retry: skb = nlmsg_new(sz, GFP_ATOMIC); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, event, info->portid, seq, nlm_flags); if (err < 0) { kfree_skb(skb); /* -EMSGSIZE implies needed space grew under us. */ if (err == -EMSGSIZE) { sz = max(rt6_nlmsg_size(rt), sz << 1); goto retry; } goto errout; } rcu_read_unlock(); rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, GFP_ATOMIC); return; errout: rcu_read_unlock(); rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info) { u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0, RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any()); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed) { struct sk_buff *skb; int err; if (READ_ONCE(f6i->offload) == offload && READ_ONCE(f6i->trap) == trap && READ_ONCE(f6i->offload_failed) == offload_failed) return; WRITE_ONCE(f6i->offload, offload); WRITE_ONCE(f6i->trap, trap); /* 2 means send notifications only if offload_failed was changed. */ if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && READ_ONCE(f6i->offload_failed) == offload_failed) return; WRITE_ONCE(f6i->offload_failed, offload_failed); if (!rcu_access_pointer(f6i->fib6_node)) /* The route was removed from the tree, do not send * notification. */ return; if (!net->ipv6.sysctl.fib_notify_on_flag_change) return; skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); if (!skb) { err = -ENOBUFS; goto errout; } err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL); return; errout: rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } EXPORT_SYMBOL(fib6_info_hw_flags_set); static int ip6_route_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); if (!(dev->flags & IFF_LOOPBACK)) return NOTIFY_OK; if (event == NETDEV_REGISTER) { net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev; net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.ip6_prohibit_entry->dst.dev = dev; net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); #endif } else if (event == NETDEV_UNREGISTER && dev->reg_state != NETREG_UNREGISTERED) { /* NETDEV_UNREGISTER could be fired for multiple times by * netdev_wait_allrefs(). Make sure we only call this once. */ in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev); in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } return NOTIFY_OK; } /* * /proc */ #ifdef CONFIG_PROC_FS static int rt6_stats_seq_show(struct seq_file *seq, void *v) { struct net *net = (struct net *)seq->private; seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), net->ipv6.rt6_stats->fib_discarded_routes); return 0; } #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SYSCTL static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; int ret; if (!write) return -EINVAL; ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (ret) return ret; net = (struct net *)ctl->extra1; delay = net->ipv6.sysctl.flush_delay; fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } static struct ctl_table ipv6_route_table_template[] = { { .procname = "max_size", .data = &init_net.ipv6.sysctl.ip6_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_thresh", .data = &ip6_dst_ops_template.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "flush", .data = &init_net.ipv6.sysctl.flush_delay, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv6_sysctl_rtcache_flush }, { .procname = "gc_min_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_timeout", .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_elasticity", .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "mtu_expires", .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "min_adv_mss", .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_min_interval_ms", .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "skip_notify_on_dev_down", .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, }; struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) { struct ctl_table *table; table = kmemdup(ipv6_route_table_template, sizeof(ipv6_route_table_template), GFP_KERNEL); if (table) { table[0].data = &net->ipv6.sysctl.ip6_rt_max_size; table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; table[2].data = &net->ipv6.sysctl.flush_delay; table[2].extra1 = net; table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity; table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires; table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss; table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down; } return table; } size_t ipv6_route_sysctl_table_size(struct net *net) { /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) return 1; return ARRAY_SIZE(ipv6_route_table_template); } #endif static int __net_init ip6_route_net_init(struct net *net) { int ret = -ENOMEM; memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, sizeof(net->ipv6.ip6_dst_ops)); if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) goto out_ip6_dst_ops; net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true); if (!net->ipv6.fib6_null_entry) goto out_ip6_dst_entries; memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template, sizeof(*net->ipv6.fib6_null_entry)); net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) goto out_fib6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_null_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_has_custom_rules = false; net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template, sizeof(*net->ipv6.ip6_prohibit_entry), GFP_KERNEL); if (!net->ipv6.ip6_prohibit_entry) goto out_ip6_null_entry; net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached); net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*net->ipv6.ip6_blk_hole_entry), GFP_KERNEL); if (!net->ipv6.ip6_blk_hole_entry) goto out_ip6_prohibit_entry; net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops; dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst, ip6_template_metrics, true); INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached); #ifdef CONFIG_IPV6_SUBTREES net->ipv6.fib6_routes_require_src = 0; #endif #endif net->ipv6.sysctl.flush_delay = 0; net->ipv6.sysctl.ip6_rt_max_size = INT_MAX; net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2; net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ; net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ; net->ipv6.sysctl.ip6_rt_gc_elasticity = 9; net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ; net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; net->ipv6.sysctl.skip_notify_on_dev_down = 0; atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ); ret = 0; out: return ret; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_ip6_prohibit_entry: kfree(net->ipv6.ip6_prohibit_entry); out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif out_fib6_null_entry: kfree(net->ipv6.fib6_null_entry); out_ip6_dst_entries: dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: goto out; } static void __net_exit ip6_route_net_exit(struct net *net) { kfree(net->ipv6.fib6_null_entry); kfree(net->ipv6.ip6_null_entry); #ifdef CONFIG_IPV6_MULTIPLE_TABLES kfree(net->ipv6.ip6_prohibit_entry); kfree(net->ipv6.ip6_blk_hole_entry); #endif dst_entries_destroy(&net->ipv6.ip6_dst_ops); } static int __net_init ip6_route_net_init_late(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops, sizeof(struct ipv6_route_iter))) return -ENOMEM; if (!proc_create_net_single("rt6_stats", 0444, net->proc_net, rt6_stats_seq_show, NULL)) { remove_proc_entry("ipv6_route", net->proc_net); return -ENOMEM; } #endif return 0; } static void __net_exit ip6_route_net_exit_late(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ipv6_route", net->proc_net); remove_proc_entry("rt6_stats", net->proc_net); #endif } static struct pernet_operations ip6_route_net_ops = { .init = ip6_route_net_init, .exit = ip6_route_net_exit, }; static int __net_init ipv6_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv6.peers = bp; return 0; } static void __net_exit ipv6_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv6.peers; net->ipv6.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static struct pernet_operations ipv6_inetpeer_ops = { .init = ipv6_inetpeer_init, .exit = ipv6_inetpeer_exit, }; static struct pernet_operations ip6_route_net_late_ops = { .init = ip6_route_net_init_late, .exit = ip6_route_net_exit_late, }; static struct notifier_block ip6_route_dev_notifier = { .notifier_call = ip6_route_dev_notify, .priority = ADDRCONF_NOTIFY_PRIORITY - 10, }; void __init ip6_route_init_special_entries(void) { /* Registering of the loopback is done before this portion of code, * the loopback reference in rt6_info will not be taken, do it * manually for init_net */ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); #endif } #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) BTF_ID_LIST_SINGLE(btf_fib6_info_id, struct, fib6_info) static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), }; static struct bpf_iter_reg ipv6_route_reg_info = { .target = "ipv6_route", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ipv6_route, rt), PTR_TO_BTF_ID_OR_NULL }, }, .seq_info = &ipv6_route_seq_info, }; static int __init bpf_iter_register(void) { ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; return bpf_iter_reg_target(&ipv6_route_reg_info); } static void bpf_iter_unregister(void) { bpf_iter_unreg_target(&ipv6_route_reg_info); } #endif #endif static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip6_route_init(void) { int ret; int cpu; ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!ip6_dst_ops_template.kmem_cachep) goto out; ret = dst_entries_init(&ip6_dst_blackhole_ops); if (ret) goto out_kmem_cache; ret = register_pernet_subsys(&ipv6_inetpeer_ops); if (ret) goto out_dst_entries; ret = register_pernet_subsys(&ip6_route_net_ops); if (ret) goto out_register_inetpeer; ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; ret = fib6_init(); if (ret) goto out_register_subsys; ret = xfrm6_init(); if (ret) goto out_fib6_init; ret = fib6_rules_init(); if (ret) goto xfrm6_init; ret = register_pernet_subsys(&ip6_route_net_late_ops); if (ret) goto fib6_rules_init; ret = rtnl_register_many(ip6_route_rtnl_msg_handlers); if (ret < 0) goto out_register_late_subsys; ret = register_netdevice_notifier(&ip6_route_dev_notifier); if (ret) goto out_register_late_subsys; #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) ret = bpf_iter_register(); if (ret) goto out_register_late_subsys; #endif #endif for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } out: return ret; out_register_late_subsys: rtnl_unregister_all(PF_INET6); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_init: fib6_rules_cleanup(); xfrm6_init: xfrm6_fini(); out_fib6_init: fib6_gc_cleanup(); out_register_subsys: unregister_pernet_subsys(&ip6_route_net_ops); out_register_inetpeer: unregister_pernet_subsys(&ipv6_inetpeer_ops); out_dst_entries: dst_entries_destroy(&ip6_dst_blackhole_ops); out_kmem_cache: kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); goto out; } void ip6_route_cleanup(void) { #if IS_BUILTIN(CONFIG_IPV6) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_unregister(); #endif #endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); xfrm6_fini(); fib6_gc_cleanup(); unregister_pernet_subsys(&ipv6_inetpeer_ops); unregister_pernet_subsys(&ip6_route_net_ops); dst_entries_destroy(&ip6_dst_blackhole_ops); kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); } |
| 9 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Public Key Signature Algorithm * * Copyright (c) 2023 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/sig.h> #include <linux/cryptouser.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "internal.h" static void crypto_sig_exit_tfm(struct crypto_tfm *tfm) { struct crypto_sig *sig = __crypto_sig_tfm(tfm); struct sig_alg *alg = crypto_sig_alg(sig); alg->exit(sig); } static int crypto_sig_init_tfm(struct crypto_tfm *tfm) { struct crypto_sig *sig = __crypto_sig_tfm(tfm); struct sig_alg *alg = crypto_sig_alg(sig); if (alg->exit) sig->base.exit = crypto_sig_exit_tfm; if (alg->init) return alg->init(sig); return 0; } static void crypto_sig_free_instance(struct crypto_instance *inst) { struct sig_instance *sig = sig_instance(inst); sig->free(sig); } static void __maybe_unused crypto_sig_show(struct seq_file *m, struct crypto_alg *alg) { seq_puts(m, "type : sig\n"); } static int __maybe_unused crypto_sig_report(struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_sig rsig = {}; strscpy(rsig.type, "sig", sizeof(rsig.type)); return nla_put(skb, CRYPTOCFGA_REPORT_SIG, sizeof(rsig), &rsig); } static const struct crypto_type crypto_sig_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_sig_init_tfm, .free = crypto_sig_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_sig_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_sig_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SIG, .tfmsize = offsetof(struct crypto_sig, base), .algsize = offsetof(struct sig_alg, base), }; struct crypto_sig *crypto_alloc_sig(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_sig_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_sig); static int sig_default_sign(struct crypto_sig *tfm, const void *src, unsigned int slen, void *dst, unsigned int dlen) { return -ENOSYS; } static int sig_default_verify(struct crypto_sig *tfm, const void *src, unsigned int slen, const void *dst, unsigned int dlen) { return -ENOSYS; } static int sig_default_set_key(struct crypto_sig *tfm, const void *key, unsigned int keylen) { return -ENOSYS; } static unsigned int sig_default_size(struct crypto_sig *tfm) { return DIV_ROUND_UP_POW2(crypto_sig_keysize(tfm), BITS_PER_BYTE); } static int sig_prepare_alg(struct sig_alg *alg) { struct crypto_alg *base = &alg->base; if (!alg->sign) alg->sign = sig_default_sign; if (!alg->verify) alg->verify = sig_default_verify; if (!alg->set_priv_key) alg->set_priv_key = sig_default_set_key; if (!alg->set_pub_key) return -EINVAL; if (!alg->key_size) return -EINVAL; if (!alg->max_size) alg->max_size = sig_default_size; if (!alg->digest_size) alg->digest_size = sig_default_size; base->cra_type = &crypto_sig_type; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; base->cra_flags |= CRYPTO_ALG_TYPE_SIG; return 0; } int crypto_register_sig(struct sig_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = sig_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_sig); void crypto_unregister_sig(struct sig_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_sig); int sig_register_instance(struct crypto_template *tmpl, struct sig_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = sig_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, sig_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(sig_register_instance); int crypto_grab_sig(struct crypto_sig_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_sig_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_sig); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Public Key Signature Algorithms"); |
| 133 596 1098 20 209 1103 20 165 233 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H #include <linux/io_uring_types.h> /* * Don't allow the cache to grow beyond this size. */ #define IO_ALLOC_CACHE_MAX 128 void io_alloc_cache_free(struct io_alloc_cache *cache, void (*free)(const void *)); bool io_alloc_cache_init(struct io_alloc_cache *cache, unsigned max_nr, unsigned int size, unsigned int init_bytes); void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp); static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, void *entry) { if (cache->nr_cached < cache->max_cached) { if (!kasan_mempool_poison_object(entry)) return false; cache->entries[cache->nr_cached++] = entry; return true; } return false; } static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) { if (cache->nr_cached) { void *entry = cache->entries[--cache->nr_cached]; /* * If KASAN is enabled, always clear the initial bytes that * must be zeroed post alloc, in case any of them overlap * with KASAN storage. */ #if defined(CONFIG_KASAN) kasan_mempool_unpoison_object(entry, cache->elem_size); if (cache->init_clear) memset(entry, 0, cache->init_clear); #endif return entry; } return NULL; } static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) { void *obj; obj = io_alloc_cache_get(cache); if (obj) return obj; return io_cache_alloc_new(cache, gfp); } static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) { if (!io_alloc_cache_put(cache, obj)) kfree(obj); } #endif |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 | /* * Any part of this program may be used in documents licensed under * the GNU Free Documentation License, Version 1.1 or any later version * published by the Free Software Foundation. */ #ifndef _PARPORT_H_ #define _PARPORT_H_ #include <linux/jiffies.h> #include <linux/proc_fs.h> #include <linux/spinlock.h> #include <linux/wait.h> #include <linux/irqreturn.h> #include <linux/semaphore.h> #include <linux/device.h> #include <asm/ptrace.h> #include <uapi/linux/parport.h> /* Define this later. */ struct parport; struct pardevice; struct pc_parport_state { unsigned int ctr; unsigned int ecr; }; struct ax_parport_state { unsigned int ctr; unsigned int ecr; unsigned int dcsr; }; /* used by both parport_amiga and parport_mfc3 */ struct amiga_parport_state { unsigned char data; /* ciaa.prb */ unsigned char datadir; /* ciaa.ddrb */ unsigned char status; /* ciab.pra & 7 */ unsigned char statusdir;/* ciab.ddrb & 7 */ }; struct ip32_parport_state { unsigned int dcr; unsigned int ecr; }; struct parport_state { union { struct pc_parport_state pc; /* ARC has no state. */ struct ax_parport_state ax; struct amiga_parport_state amiga; /* Atari has not state. */ struct ip32_parport_state ip32; void *misc; } u; }; struct parport_operations { /* IBM PC-style virtual registers. */ void (*write_data)(struct parport *, unsigned char); unsigned char (*read_data)(struct parport *); void (*write_control)(struct parport *, unsigned char); unsigned char (*read_control)(struct parport *); unsigned char (*frob_control)(struct parport *, unsigned char mask, unsigned char val); unsigned char (*read_status)(struct parport *); /* IRQs. */ void (*enable_irq)(struct parport *); void (*disable_irq)(struct parport *); /* Data direction. */ void (*data_forward) (struct parport *); void (*data_reverse) (struct parport *); /* For core parport code. */ void (*init_state)(struct pardevice *, struct parport_state *); void (*save_state)(struct parport *, struct parport_state *); void (*restore_state)(struct parport *, struct parport_state *); /* Block read/write */ size_t (*epp_write_data) (struct parport *port, const void *buf, size_t len, int flags); size_t (*epp_read_data) (struct parport *port, void *buf, size_t len, int flags); size_t (*epp_write_addr) (struct parport *port, const void *buf, size_t len, int flags); size_t (*epp_read_addr) (struct parport *port, void *buf, size_t len, int flags); size_t (*ecp_write_data) (struct parport *port, const void *buf, size_t len, int flags); size_t (*ecp_read_data) (struct parport *port, void *buf, size_t len, int flags); size_t (*ecp_write_addr) (struct parport *port, const void *buf, size_t len, int flags); size_t (*compat_write_data) (struct parport *port, const void *buf, size_t len, int flags); size_t (*nibble_read_data) (struct parport *port, void *buf, size_t len, int flags); size_t (*byte_read_data) (struct parport *port, void *buf, size_t len, int flags); struct module *owner; }; struct parport_device_info { parport_device_class class; const char *class_name; const char *mfr; const char *model; const char *cmdset; const char *description; }; /* Each device can have two callback functions: * 1) a preemption function, called by the resource manager to request * that the driver relinquish control of the port. The driver should * return zero if it agrees to release the port, and nonzero if it * refuses. Do not call parport_release() - the kernel will do this * implicitly. * * 2) a wake-up function, called by the resource manager to tell drivers * that the port is available to be claimed. If a driver wants to use * the port, it should call parport_claim() here. */ /* A parallel port device */ struct pardevice { const char *name; struct parport *port; int daisy; int (*preempt)(void *); void (*wakeup)(void *); void *private; void (*irq_func)(void *); unsigned int flags; struct pardevice *next; struct pardevice *prev; struct device dev; bool devmodel; struct parport_state *state; /* saved status over preemption */ wait_queue_head_t wait_q; unsigned long int time; unsigned long int timeslice; volatile long int timeout; unsigned long waiting; /* long req'd for set_bit --RR */ struct pardevice *waitprev; struct pardevice *waitnext; void * sysctl_table; }; #define to_pardevice(n) container_of(n, struct pardevice, dev) /* IEEE1284 information */ /* IEEE1284 phases. These are exposed to userland through ppdev IOCTL * PP[GS]ETPHASE, so do not change existing values. */ enum ieee1284_phase { IEEE1284_PH_FWD_DATA, IEEE1284_PH_FWD_IDLE, IEEE1284_PH_TERMINATE, IEEE1284_PH_NEGOTIATION, IEEE1284_PH_HBUSY_DNA, IEEE1284_PH_REV_IDLE, IEEE1284_PH_HBUSY_DAVAIL, IEEE1284_PH_REV_DATA, IEEE1284_PH_ECP_SETUP, IEEE1284_PH_ECP_FWD_TO_REV, IEEE1284_PH_ECP_REV_TO_FWD, IEEE1284_PH_ECP_DIR_UNKNOWN, }; struct ieee1284_info { int mode; volatile enum ieee1284_phase phase; struct semaphore irq; }; /* A parallel port */ struct parport { unsigned long base; /* base address */ unsigned long base_hi; /* base address (hi - ECR) */ unsigned int size; /* IO extent */ const char *name; unsigned int modes; int irq; /* interrupt (or -1 for none) */ int dma; int muxport; /* which muxport (if any) this is */ int portnum; /* which physical parallel port (not mux) */ struct device *dev; /* Physical device associated with IO/DMA. * This may unfortulately be null if the * port has a legacy driver. */ struct device bus_dev; /* to link with the bus */ struct parport *physport; /* If this is a non-default mux parport, i.e. we're a clone of a real physical port, this is a pointer to that port. The locking is only done in the real port. For a clone port, the following structure members are meaningless: devices, cad, muxsel, waithead, waittail, flags, pdir, dev, ieee1284, *_lock. It this is a default mux parport, or there is no mux involved, this points to ourself. */ struct pardevice *devices; struct pardevice *cad; /* port owner */ int daisy; /* currently selected daisy addr */ int muxsel; /* currently selected mux port */ struct pardevice *waithead; struct pardevice *waittail; struct list_head list; struct timer_list timer; unsigned int flags; void *sysctl_table; struct parport_device_info probe_info[5]; /* 0-3 + non-IEEE1284.3 */ struct ieee1284_info ieee1284; struct parport_operations *ops; void *private_data; /* for lowlevel driver */ int number; /* port index - the `n' in `parportn' */ spinlock_t pardevice_lock; spinlock_t waitlist_lock; rwlock_t cad_lock; int spintime; atomic_t ref_count; unsigned long devflags; #define PARPORT_DEVPROC_REGISTERED 0 struct pardevice *proc_device; /* Currently register proc device */ struct list_head full_list; struct parport *slaves[3]; }; #define to_parport_dev(n) container_of(n, struct parport, bus_dev) #define DEFAULT_SPIN_TIME 500 /* us */ struct parport_driver { const char *name; void (*detach) (struct parport *); void (*match_port)(struct parport *); int (*probe)(struct pardevice *); struct device_driver driver; }; #define to_parport_driver(n) container_of(n, struct parport_driver, driver) int parport_bus_init(void); void parport_bus_exit(void); /* parport_register_port registers a new parallel port at the given address (if one does not already exist) and returns a pointer to it. This entails claiming the I/O region, IRQ and DMA. NULL is returned if initialisation fails. */ struct parport *parport_register_port(unsigned long base, int irq, int dma, struct parport_operations *ops); /* Once a registered port is ready for high-level drivers to use, the low-level driver that registered it should announce it. This will call the high-level drivers' attach() functions (after things like determining the IEEE 1284.3 topology of the port and collecting DeviceIDs). */ void parport_announce_port (struct parport *port); /* Unregister a port. */ extern void parport_remove_port(struct parport *port); /* Register a new high-level driver. */ int __must_check __parport_register_driver(struct parport_driver *, struct module *, const char *mod_name); /* * parport_register_driver must be a macro so that KBUILD_MODNAME can * be expanded */ /** * parport_register_driver - register a parallel port device driver * @driver: structure describing the driver * * This can be called by a parallel port device driver in order * to receive notifications about ports being found in the * system, as well as ports no longer available. * * The @driver structure is allocated by the caller and must not be * deallocated until after calling parport_unregister_driver(). * * If using the non device model: * The driver's attach() function may block. The port that * attach() is given will be valid for the duration of the * callback, but if the driver wants to take a copy of the * pointer it must call parport_get_port() to do so. Calling * parport_register_device() on that port will do this for you. * * The driver's detach() function may block. The port that * detach() is given will be valid for the duration of the * callback, but if the driver wants to take a copy of the * pointer it must call parport_get_port() to do so. * * * Returns 0 on success. The non device model will always succeeds. * but the new device model can fail and will return the error code. **/ #define parport_register_driver(driver) \ __parport_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) /* Unregister a high-level driver. */ void parport_unregister_driver(struct parport_driver *); /** * module_parport_driver() - Helper macro for registering a modular parport driver * @__parport_driver: struct parport_driver to be used * * Helper macro for parport drivers which do not do anything special in module * init and exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit(). */ #define module_parport_driver(__parport_driver) \ module_driver(__parport_driver, parport_register_driver, parport_unregister_driver) /* If parport_register_driver doesn't fit your needs, perhaps * parport_find_xxx does. */ extern struct parport *parport_find_number (int); extern struct parport *parport_find_base (unsigned long); /* generic irq handler, if it suits your needs */ extern irqreturn_t parport_irq_handler(int irq, void *dev_id); /* Reference counting for ports. */ extern struct parport *parport_get_port (struct parport *); extern void parport_put_port (struct parport *); void parport_del_port(struct parport *); struct pardev_cb { int (*preempt)(void *); void (*wakeup)(void *); void *private; void (*irq_func)(void *); unsigned int flags; }; /* * parport_register_dev_model declares that a device is connected to a * port, and tells the kernel all it needs to know. */ struct pardevice * parport_register_dev_model(struct parport *port, const char *name, const struct pardev_cb *par_dev_cb, int cnt); /* parport_unregister unlinks a device from the chain. */ extern void parport_unregister_device(struct pardevice *dev); /* parport_claim tries to gain ownership of the port for a particular driver. This may fail (return non-zero) if another driver is busy. If this driver has registered an interrupt handler, it will be enabled. */ extern int parport_claim(struct pardevice *dev); /* parport_claim_or_block is the same, but sleeps if the port cannot be claimed. Return value is 1 if it slept, 0 normally and -errno on error. */ extern int parport_claim_or_block(struct pardevice *dev); /* parport_release reverses a previous parport_claim. This can never fail, though the effects are undefined (except that they are bad) if you didn't previously own the port. Once you have released the port you should make sure that neither your code nor the hardware on the port tries to initiate any communication without first re-claiming the port. If you mess with the port state (enabling ECP for example) you should clean up before releasing the port. */ extern void parport_release(struct pardevice *dev); /** * parport_yield - relinquish a parallel port temporarily * @dev: a device on the parallel port * * This function relinquishes the port if it would be helpful to other * drivers to do so. Afterwards it tries to reclaim the port using * parport_claim(), and the return value is the same as for * parport_claim(). If it fails, the port is left unclaimed and it is * the driver's responsibility to reclaim the port. * * The parport_yield() and parport_yield_blocking() functions are for * marking points in the driver at which other drivers may claim the * port and use their devices. Yielding the port is similar to * releasing it and reclaiming it, but is more efficient because no * action is taken if there are no other devices needing the port. In * fact, nothing is done even if there are other devices waiting but * the current device is still within its "timeslice". The default * timeslice is half a second, but it can be adjusted via the /proc * interface. **/ static __inline__ int parport_yield(struct pardevice *dev) { unsigned long int timeslip = (jiffies - dev->time); if ((dev->port->waithead == NULL) || (timeslip < dev->timeslice)) return 0; parport_release(dev); return parport_claim(dev); } /** * parport_yield_blocking - relinquish a parallel port temporarily * @dev: a device on the parallel port * * This function relinquishes the port if it would be helpful to other * drivers to do so. Afterwards it tries to reclaim the port using * parport_claim_or_block(), and the return value is the same as for * parport_claim_or_block(). **/ static __inline__ int parport_yield_blocking(struct pardevice *dev) { unsigned long int timeslip = (jiffies - dev->time); if ((dev->port->waithead == NULL) || (timeslip < dev->timeslice)) return 0; parport_release(dev); return parport_claim_or_block(dev); } /* Flags used to identify what a device does. */ #define PARPORT_DEV_TRAN 0 /* WARNING !! DEPRECATED !! */ #define PARPORT_DEV_LURK (1<<0) /* WARNING !! DEPRECATED !! */ #define PARPORT_DEV_EXCL (1<<1) /* Need exclusive access. */ #define PARPORT_FLAG_EXCL (1<<1) /* EXCL driver registered. */ /* IEEE1284 functions */ extern void parport_ieee1284_interrupt (void *); extern int parport_negotiate (struct parport *, int mode); extern ssize_t parport_write (struct parport *, const void *buf, size_t len); extern ssize_t parport_read (struct parport *, void *buf, size_t len); #define PARPORT_INACTIVITY_O_NONBLOCK 1 extern long parport_set_timeout (struct pardevice *, long inactivity); extern int parport_wait_event (struct parport *, long timeout); extern int parport_wait_peripheral (struct parport *port, unsigned char mask, unsigned char val); extern int parport_poll_peripheral (struct parport *port, unsigned char mask, unsigned char val, int usec); /* For architectural drivers */ extern size_t parport_ieee1284_write_compat (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_read_nibble (struct parport *, void *, size_t, int); extern size_t parport_ieee1284_read_byte (struct parport *, void *, size_t, int); extern size_t parport_ieee1284_ecp_read_data (struct parport *, void *, size_t, int); extern size_t parport_ieee1284_ecp_write_data (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_ecp_write_addr (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_epp_write_data (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_epp_read_data (struct parport *, void *, size_t, int); extern size_t parport_ieee1284_epp_write_addr (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_epp_read_addr (struct parport *, void *, size_t, int); /* IEEE1284.3 functions */ #define daisy_dev_name "Device ID probe" extern int parport_daisy_init (struct parport *port); extern void parport_daisy_fini (struct parport *port); extern struct pardevice *parport_open (int devnum, const char *name); extern void parport_close (struct pardevice *dev); extern ssize_t parport_device_id (int devnum, char *buffer, size_t len); extern void parport_daisy_deselect_all (struct parport *port); extern int parport_daisy_select (struct parport *port, int daisy, int mode); /* Lowlevel drivers _can_ call this support function to handle irqs. */ static inline void parport_generic_irq(struct parport *port) { parport_ieee1284_interrupt (port); read_lock(&port->cad_lock); if (port->cad && port->cad->irq_func) port->cad->irq_func(port->cad->private); read_unlock(&port->cad_lock); } /* Prototypes from parport_procfs */ extern int parport_proc_register(struct parport *pp); extern int parport_proc_unregister(struct parport *pp); extern int parport_device_proc_register(struct pardevice *device); extern int parport_device_proc_unregister(struct pardevice *device); /* If PC hardware is the only type supported, we can optimise a bit. */ #if !defined(CONFIG_PARPORT_NOT_PC) && defined(CONFIG_PARPORT_PC) #include <linux/parport_pc.h> #define parport_write_data(p,x) parport_pc_write_data(p,x) #define parport_read_data(p) parport_pc_read_data(p) #define parport_write_control(p,x) parport_pc_write_control(p,x) #define parport_read_control(p) parport_pc_read_control(p) #define parport_frob_control(p,m,v) parport_pc_frob_control(p,m,v) #define parport_read_status(p) parport_pc_read_status(p) #define parport_enable_irq(p) parport_pc_enable_irq(p) #define parport_disable_irq(p) parport_pc_disable_irq(p) #define parport_data_forward(p) parport_pc_data_forward(p) #define parport_data_reverse(p) parport_pc_data_reverse(p) #else /* !CONFIG_PARPORT_NOT_PC */ /* Generic operations vector through the dispatch table. */ #define parport_write_data(p,x) (p)->ops->write_data(p,x) #define parport_read_data(p) (p)->ops->read_data(p) #define parport_write_control(p,x) (p)->ops->write_control(p,x) #define parport_read_control(p) (p)->ops->read_control(p) #define parport_frob_control(p,m,v) (p)->ops->frob_control(p,m,v) #define parport_read_status(p) (p)->ops->read_status(p) #define parport_enable_irq(p) (p)->ops->enable_irq(p) #define parport_disable_irq(p) (p)->ops->disable_irq(p) #define parport_data_forward(p) (p)->ops->data_forward(p) #define parport_data_reverse(p) (p)->ops->data_reverse(p) #endif /* !CONFIG_PARPORT_NOT_PC */ extern unsigned long parport_default_timeslice; extern int parport_default_spintime; #endif /* _PARPORT_H_ */ |
| 2 2 2 2 2 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 | // SPDX-License-Identifier: GPL-2.0+ /* * Copyright 2019 Hans de Goede <hdegoede@redhat.com> */ #include <linux/module.h> #include <linux/pm.h> #include <linux/usb.h> #include <drm/clients/drm_client_setup.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_atomic_state_helper.h> #include <drm/drm_connector.h> #include <drm/drm_damage_helper.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_fbdev_shmem.h> #include <drm/drm_file.h> #include <drm/drm_format_helper.h> #include <drm/drm_fourcc.h> #include <drm/drm_framebuffer.h> #include <drm/drm_gem_atomic_helper.h> #include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_gem_shmem_helper.h> #include <drm/drm_ioctl.h> #include <drm/drm_managed.h> #include <drm/drm_modeset_helper_vtables.h> #include <drm/drm_probe_helper.h> #include <drm/drm_simple_kms_helper.h> static bool eco_mode; module_param(eco_mode, bool, 0644); MODULE_PARM_DESC(eco_mode, "Turn on Eco mode (less bright, more silent)"); #define DRIVER_NAME "gm12u320" #define DRIVER_DESC "Grain Media GM12U320 USB projector display" #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0 /* * The DLP has an actual width of 854 pixels, but that is not a multiple * of 8, breaking things left and right, so we export a width of 848. */ #define GM12U320_USER_WIDTH 848 #define GM12U320_REAL_WIDTH 854 #define GM12U320_HEIGHT 480 #define GM12U320_BLOCK_COUNT 20 #define GM12U320_ERR(fmt, ...) \ DRM_DEV_ERROR(gm12u320->dev.dev, fmt, ##__VA_ARGS__) #define MISC_RCV_EPT 1 #define DATA_RCV_EPT 2 #define DATA_SND_EPT 3 #define MISC_SND_EPT 4 #define DATA_BLOCK_HEADER_SIZE 84 #define DATA_BLOCK_CONTENT_SIZE 64512 #define DATA_BLOCK_FOOTER_SIZE 20 #define DATA_BLOCK_SIZE (DATA_BLOCK_HEADER_SIZE + \ DATA_BLOCK_CONTENT_SIZE + \ DATA_BLOCK_FOOTER_SIZE) #define DATA_LAST_BLOCK_CONTENT_SIZE 4032 #define DATA_LAST_BLOCK_SIZE (DATA_BLOCK_HEADER_SIZE + \ DATA_LAST_BLOCK_CONTENT_SIZE + \ DATA_BLOCK_FOOTER_SIZE) #define CMD_SIZE 31 #define READ_STATUS_SIZE 13 #define MISC_VALUE_SIZE 4 #define CMD_TIMEOUT 200 #define DATA_TIMEOUT 1000 #define IDLE_TIMEOUT 2000 #define FIRST_FRAME_TIMEOUT 2000 #define MISC_REQ_GET_SET_ECO_A 0xff #define MISC_REQ_GET_SET_ECO_B 0x35 /* Windows driver does once every second, with arg d = 1, other args 0 */ #define MISC_REQ_UNKNOWN1_A 0xff #define MISC_REQ_UNKNOWN1_B 0x38 /* Windows driver does this on init, with arg a, b = 0, c = 0xa0, d = 4 */ #define MISC_REQ_UNKNOWN2_A 0xa5 #define MISC_REQ_UNKNOWN2_B 0x00 struct gm12u320_device { struct drm_device dev; struct drm_simple_display_pipe pipe; struct drm_connector conn; unsigned char *cmd_buf; unsigned char *data_buf[GM12U320_BLOCK_COUNT]; struct { struct delayed_work work; struct mutex lock; struct drm_framebuffer *fb; struct drm_rect rect; int frame; int draw_status_timeout; struct iosys_map src_map; } fb_update; }; #define to_gm12u320(__dev) container_of(__dev, struct gm12u320_device, dev) static const char cmd_data[CMD_SIZE] = { 0x55, 0x53, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x68, 0xfc, 0x00, 0x00, 0x00, 0x00, 0x10, 0xff, 0x00, 0x00, 0x00, 0x00, 0xfc, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const char cmd_draw[CMD_SIZE] = { 0x55, 0x53, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0xfe, 0x00, 0x00, 0x00, 0xc0, 0xd1, 0x05, 0x00, 0x40, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const char cmd_misc[CMD_SIZE] = { 0x55, 0x53, 0x42, 0x43, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x80, 0x01, 0x10, 0xfd, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; static const char data_block_header[DATA_BLOCK_HEADER_SIZE] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfb, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x15, 0x00, 0x00, 0xfc, 0x00, 0x00, 0x01, 0x00, 0x00, 0xdb }; static const char data_last_block_header[DATA_BLOCK_HEADER_SIZE] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xfb, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x2a, 0x00, 0x20, 0x00, 0xc0, 0x0f, 0x00, 0x00, 0x01, 0x00, 0x00, 0xd7 }; static const char data_block_footer[DATA_BLOCK_FOOTER_SIZE] = { 0xfb, 0x14, 0x02, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x4f }; static inline struct usb_device *gm12u320_to_usb_device(struct gm12u320_device *gm12u320) { return interface_to_usbdev(to_usb_interface(gm12u320->dev.dev)); } static int gm12u320_usb_alloc(struct gm12u320_device *gm12u320) { int i, block_size; const char *hdr; gm12u320->cmd_buf = drmm_kmalloc(&gm12u320->dev, CMD_SIZE, GFP_KERNEL); if (!gm12u320->cmd_buf) return -ENOMEM; for (i = 0; i < GM12U320_BLOCK_COUNT; i++) { if (i == GM12U320_BLOCK_COUNT - 1) { block_size = DATA_LAST_BLOCK_SIZE; hdr = data_last_block_header; } else { block_size = DATA_BLOCK_SIZE; hdr = data_block_header; } gm12u320->data_buf[i] = drmm_kzalloc(&gm12u320->dev, block_size, GFP_KERNEL); if (!gm12u320->data_buf[i]) return -ENOMEM; memcpy(gm12u320->data_buf[i], hdr, DATA_BLOCK_HEADER_SIZE); memcpy(gm12u320->data_buf[i] + (block_size - DATA_BLOCK_FOOTER_SIZE), data_block_footer, DATA_BLOCK_FOOTER_SIZE); } return 0; } static int gm12u320_misc_request(struct gm12u320_device *gm12u320, u8 req_a, u8 req_b, u8 arg_a, u8 arg_b, u8 arg_c, u8 arg_d) { struct usb_device *udev = gm12u320_to_usb_device(gm12u320); int ret, len; memcpy(gm12u320->cmd_buf, &cmd_misc, CMD_SIZE); gm12u320->cmd_buf[20] = req_a; gm12u320->cmd_buf[21] = req_b; gm12u320->cmd_buf[22] = arg_a; gm12u320->cmd_buf[23] = arg_b; gm12u320->cmd_buf[24] = arg_c; gm12u320->cmd_buf[25] = arg_d; /* Send request */ ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, MISC_SND_EPT), gm12u320->cmd_buf, CMD_SIZE, &len, CMD_TIMEOUT); if (ret || len != CMD_SIZE) { GM12U320_ERR("Misc. req. error %d\n", ret); return -EIO; } /* Read value */ ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, MISC_RCV_EPT), gm12u320->cmd_buf, MISC_VALUE_SIZE, &len, DATA_TIMEOUT); if (ret || len != MISC_VALUE_SIZE) { GM12U320_ERR("Misc. value error %d\n", ret); return -EIO; } /* cmd_buf[0] now contains the read value, which we don't use */ /* Read status */ ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, MISC_RCV_EPT), gm12u320->cmd_buf, READ_STATUS_SIZE, &len, CMD_TIMEOUT); if (ret || len != READ_STATUS_SIZE) { GM12U320_ERR("Misc. status error %d\n", ret); return -EIO; } return 0; } static void gm12u320_32bpp_to_24bpp_packed(u8 *dst, u8 *src, int len) { while (len--) { *dst++ = *src++; *dst++ = *src++; *dst++ = *src++; src++; } } static void gm12u320_copy_fb_to_blocks(struct gm12u320_device *gm12u320) { int block, dst_offset, len, remain, ret, x1, x2, y1, y2; struct drm_framebuffer *fb; void *vaddr; u8 *src; mutex_lock(&gm12u320->fb_update.lock); if (!gm12u320->fb_update.fb) goto unlock; fb = gm12u320->fb_update.fb; x1 = gm12u320->fb_update.rect.x1; x2 = gm12u320->fb_update.rect.x2; y1 = gm12u320->fb_update.rect.y1; y2 = gm12u320->fb_update.rect.y2; vaddr = gm12u320->fb_update.src_map.vaddr; /* TODO: Use mapping abstraction properly */ ret = drm_gem_fb_begin_cpu_access(fb, DMA_FROM_DEVICE); if (ret) { GM12U320_ERR("drm_gem_fb_begin_cpu_access err: %d\n", ret); goto put_fb; } src = vaddr + y1 * fb->pitches[0] + x1 * 4; x1 += (GM12U320_REAL_WIDTH - GM12U320_USER_WIDTH) / 2; x2 += (GM12U320_REAL_WIDTH - GM12U320_USER_WIDTH) / 2; for (; y1 < y2; y1++) { remain = 0; len = (x2 - x1) * 3; dst_offset = (y1 * GM12U320_REAL_WIDTH + x1) * 3; block = dst_offset / DATA_BLOCK_CONTENT_SIZE; dst_offset %= DATA_BLOCK_CONTENT_SIZE; if ((dst_offset + len) > DATA_BLOCK_CONTENT_SIZE) { remain = dst_offset + len - DATA_BLOCK_CONTENT_SIZE; len = DATA_BLOCK_CONTENT_SIZE - dst_offset; } dst_offset += DATA_BLOCK_HEADER_SIZE; len /= 3; gm12u320_32bpp_to_24bpp_packed( gm12u320->data_buf[block] + dst_offset, src, len); if (remain) { block++; dst_offset = DATA_BLOCK_HEADER_SIZE; gm12u320_32bpp_to_24bpp_packed( gm12u320->data_buf[block] + dst_offset, src + len * 4, remain / 3); } src += fb->pitches[0]; } drm_gem_fb_end_cpu_access(fb, DMA_FROM_DEVICE); put_fb: drm_framebuffer_put(fb); gm12u320->fb_update.fb = NULL; unlock: mutex_unlock(&gm12u320->fb_update.lock); } static void gm12u320_fb_update_work(struct work_struct *work) { struct gm12u320_device *gm12u320 = container_of(to_delayed_work(work), struct gm12u320_device, fb_update.work); struct usb_device *udev = gm12u320_to_usb_device(gm12u320); int block, block_size, len; int ret = 0; gm12u320_copy_fb_to_blocks(gm12u320); for (block = 0; block < GM12U320_BLOCK_COUNT; block++) { if (block == GM12U320_BLOCK_COUNT - 1) block_size = DATA_LAST_BLOCK_SIZE; else block_size = DATA_BLOCK_SIZE; /* Send data command to device */ memcpy(gm12u320->cmd_buf, cmd_data, CMD_SIZE); gm12u320->cmd_buf[8] = block_size & 0xff; gm12u320->cmd_buf[9] = block_size >> 8; gm12u320->cmd_buf[20] = 0xfc - block * 4; gm12u320->cmd_buf[21] = block | (gm12u320->fb_update.frame << 7); ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, DATA_SND_EPT), gm12u320->cmd_buf, CMD_SIZE, &len, CMD_TIMEOUT); if (ret || len != CMD_SIZE) goto err; /* Send data block to device */ ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, DATA_SND_EPT), gm12u320->data_buf[block], block_size, &len, DATA_TIMEOUT); if (ret || len != block_size) goto err; /* Read status */ ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, DATA_RCV_EPT), gm12u320->cmd_buf, READ_STATUS_SIZE, &len, CMD_TIMEOUT); if (ret || len != READ_STATUS_SIZE) goto err; } /* Send draw command to device */ memcpy(gm12u320->cmd_buf, cmd_draw, CMD_SIZE); ret = usb_bulk_msg(udev, usb_sndbulkpipe(udev, DATA_SND_EPT), gm12u320->cmd_buf, CMD_SIZE, &len, CMD_TIMEOUT); if (ret || len != CMD_SIZE) goto err; /* Read status */ ret = usb_bulk_msg(udev, usb_rcvbulkpipe(udev, DATA_RCV_EPT), gm12u320->cmd_buf, READ_STATUS_SIZE, &len, gm12u320->fb_update.draw_status_timeout); if (ret || len != READ_STATUS_SIZE) goto err; gm12u320->fb_update.draw_status_timeout = CMD_TIMEOUT; gm12u320->fb_update.frame = !gm12u320->fb_update.frame; /* * We must draw a frame every 2s otherwise the projector * switches back to showing its logo. */ queue_delayed_work(system_long_wq, &gm12u320->fb_update.work, msecs_to_jiffies(IDLE_TIMEOUT)); return; err: /* Do not log errors caused by module unload or device unplug */ if (ret != -ENODEV && ret != -ECONNRESET && ret != -ESHUTDOWN) GM12U320_ERR("Frame update error: %d\n", ret); } static void gm12u320_fb_mark_dirty(struct drm_framebuffer *fb, const struct iosys_map *map, struct drm_rect *dirty) { struct gm12u320_device *gm12u320 = to_gm12u320(fb->dev); struct drm_framebuffer *old_fb = NULL; bool wakeup = false; mutex_lock(&gm12u320->fb_update.lock); if (gm12u320->fb_update.fb != fb) { old_fb = gm12u320->fb_update.fb; drm_framebuffer_get(fb); gm12u320->fb_update.fb = fb; gm12u320->fb_update.rect = *dirty; gm12u320->fb_update.src_map = *map; wakeup = true; } else { struct drm_rect *rect = &gm12u320->fb_update.rect; rect->x1 = min(rect->x1, dirty->x1); rect->y1 = min(rect->y1, dirty->y1); rect->x2 = max(rect->x2, dirty->x2); rect->y2 = max(rect->y2, dirty->y2); } mutex_unlock(&gm12u320->fb_update.lock); if (wakeup) mod_delayed_work(system_long_wq, &gm12u320->fb_update.work, 0); if (old_fb) drm_framebuffer_put(old_fb); } static void gm12u320_stop_fb_update(struct gm12u320_device *gm12u320) { struct drm_framebuffer *old_fb; cancel_delayed_work_sync(&gm12u320->fb_update.work); mutex_lock(&gm12u320->fb_update.lock); old_fb = gm12u320->fb_update.fb; gm12u320->fb_update.fb = NULL; iosys_map_clear(&gm12u320->fb_update.src_map); mutex_unlock(&gm12u320->fb_update.lock); drm_framebuffer_put(old_fb); } static int gm12u320_set_ecomode(struct gm12u320_device *gm12u320) { return gm12u320_misc_request(gm12u320, MISC_REQ_GET_SET_ECO_A, MISC_REQ_GET_SET_ECO_B, 0x01 /* set */, eco_mode ? 0x01 : 0x00, 0x00, 0x01); } /* ------------------------------------------------------------------ */ /* gm12u320 connector */ /* *Â We use fake EDID info so that userspace know that it is dealing with * an Acer projector, rather then listing this as an "unknown" monitor. * Note this assumes this driver is only ever used with the Acer C120, if we * add support for other devices the vendor and model should be parameterized. */ static const struct edid gm12u320_edid = { .header = { 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, .mfg_id = { 0x04, 0x72 }, /* "ACR" */ .prod_code = { 0x20, 0xc1 }, /* C120h */ .serial = 0xaa55aa55, .mfg_week = 1, .mfg_year = 16, .version = 1, /* EDID 1.3 */ .revision = 3, /* EDID 1.3 */ .input = 0x08, /* Analog input */ .features = 0x0a, /* Pref timing in DTD 1 */ .standard_timings = { { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 } }, .detailed_timings = { { .pixel_clock = 3383, /* hactive = 848, hblank = 256 */ .data.pixel_data.hactive_lo = 0x50, .data.pixel_data.hblank_lo = 0x00, .data.pixel_data.hactive_hblank_hi = 0x31, /* vactive = 480, vblank = 28 */ .data.pixel_data.vactive_lo = 0xe0, .data.pixel_data.vblank_lo = 0x1c, .data.pixel_data.vactive_vblank_hi = 0x10, /* hsync offset 40 pw 128, vsync offset 1 pw 4 */ .data.pixel_data.hsync_offset_lo = 0x28, .data.pixel_data.hsync_pulse_width_lo = 0x80, .data.pixel_data.vsync_offset_pulse_width_lo = 0x14, .data.pixel_data.hsync_vsync_offset_pulse_width_hi = 0x00, /* Digital separate syncs, hsync+, vsync+ */ .data.pixel_data.misc = 0x1e, }, { .pixel_clock = 0, .data.other_data.type = 0xfd, /* Monitor ranges */ .data.other_data.data.range.min_vfreq = 59, .data.other_data.data.range.max_vfreq = 61, .data.other_data.data.range.min_hfreq_khz = 29, .data.other_data.data.range.max_hfreq_khz = 32, .data.other_data.data.range.pixel_clock_mhz = 4, /* 40 MHz */ .data.other_data.data.range.flags = 0, .data.other_data.data.range.formula.cvt = { 0xa0, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }, }, { .pixel_clock = 0, .data.other_data.type = 0xfc, /* Model string */ .data.other_data.data.str.str = { 'P', 'r', 'o', 'j', 'e', 'c', 't', 'o', 'r', '\n', ' ', ' ', ' ' }, }, { .pixel_clock = 0, .data.other_data.type = 0xfe, /* Unspecified text / padding */ .data.other_data.data.str.str = { '\n', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ' }, } }, .checksum = 0x13, }; static int gm12u320_conn_get_modes(struct drm_connector *connector) { const struct drm_edid *drm_edid; int count; drm_edid = drm_edid_alloc(&gm12u320_edid, sizeof(gm12u320_edid)); drm_edid_connector_update(connector, drm_edid); count = drm_edid_connector_add_modes(connector); drm_edid_free(drm_edid); return count; } static const struct drm_connector_helper_funcs gm12u320_conn_helper_funcs = { .get_modes = gm12u320_conn_get_modes, }; static const struct drm_connector_funcs gm12u320_conn_funcs = { .fill_modes = drm_helper_probe_single_connector_modes, .destroy = drm_connector_cleanup, .reset = drm_atomic_helper_connector_reset, .atomic_duplicate_state = drm_atomic_helper_connector_duplicate_state, .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, }; static int gm12u320_conn_init(struct gm12u320_device *gm12u320) { drm_connector_helper_add(&gm12u320->conn, &gm12u320_conn_helper_funcs); return drm_connector_init(&gm12u320->dev, &gm12u320->conn, &gm12u320_conn_funcs, DRM_MODE_CONNECTOR_VGA); } /* ------------------------------------------------------------------ */ /* gm12u320 (simple) display pipe */ static void gm12u320_pipe_enable(struct drm_simple_display_pipe *pipe, struct drm_crtc_state *crtc_state, struct drm_plane_state *plane_state) { struct drm_rect rect = { 0, 0, GM12U320_USER_WIDTH, GM12U320_HEIGHT }; struct gm12u320_device *gm12u320 = to_gm12u320(pipe->crtc.dev); struct drm_shadow_plane_state *shadow_plane_state = to_drm_shadow_plane_state(plane_state); gm12u320->fb_update.draw_status_timeout = FIRST_FRAME_TIMEOUT; gm12u320_fb_mark_dirty(plane_state->fb, &shadow_plane_state->data[0], &rect); } static void gm12u320_pipe_disable(struct drm_simple_display_pipe *pipe) { struct gm12u320_device *gm12u320 = to_gm12u320(pipe->crtc.dev); gm12u320_stop_fb_update(gm12u320); } static void gm12u320_pipe_update(struct drm_simple_display_pipe *pipe, struct drm_plane_state *old_state) { struct drm_plane_state *state = pipe->plane.state; struct drm_shadow_plane_state *shadow_plane_state = to_drm_shadow_plane_state(state); struct drm_rect rect; if (drm_atomic_helper_damage_merged(old_state, state, &rect)) gm12u320_fb_mark_dirty(state->fb, &shadow_plane_state->data[0], &rect); } static const struct drm_simple_display_pipe_funcs gm12u320_pipe_funcs = { .enable = gm12u320_pipe_enable, .disable = gm12u320_pipe_disable, .update = gm12u320_pipe_update, DRM_GEM_SIMPLE_DISPLAY_PIPE_SHADOW_PLANE_FUNCS, }; static const uint32_t gm12u320_pipe_formats[] = { DRM_FORMAT_XRGB8888, }; static const uint64_t gm12u320_pipe_modifiers[] = { DRM_FORMAT_MOD_LINEAR, DRM_FORMAT_MOD_INVALID }; DEFINE_DRM_GEM_FOPS(gm12u320_fops); static const struct drm_driver gm12u320_drm_driver = { .driver_features = DRIVER_MODESET | DRIVER_GEM | DRIVER_ATOMIC, .name = DRIVER_NAME, .desc = DRIVER_DESC, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, .fops = &gm12u320_fops, DRM_GEM_SHMEM_DRIVER_OPS, DRM_FBDEV_SHMEM_DRIVER_OPS, }; static const struct drm_mode_config_funcs gm12u320_mode_config_funcs = { .fb_create = drm_gem_fb_create_with_dirty, .atomic_check = drm_atomic_helper_check, .atomic_commit = drm_atomic_helper_commit, }; static int gm12u320_usb_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct gm12u320_device *gm12u320; struct drm_device *dev; struct device *dma_dev; int ret; /* * The gm12u320 presents itself to the system as 2 usb mass-storage * interfaces, we only care about / need the first one. */ if (interface->cur_altsetting->desc.bInterfaceNumber != 0) return -ENODEV; gm12u320 = devm_drm_dev_alloc(&interface->dev, &gm12u320_drm_driver, struct gm12u320_device, dev); if (IS_ERR(gm12u320)) return PTR_ERR(gm12u320); dev = &gm12u320->dev; dma_dev = usb_intf_get_dma_device(interface); if (dma_dev) { drm_dev_set_dma_dev(dev, dma_dev); put_device(dma_dev); } else { drm_warn(dev, "buffer sharing not supported"); /* not an error */ } INIT_DELAYED_WORK(&gm12u320->fb_update.work, gm12u320_fb_update_work); mutex_init(&gm12u320->fb_update.lock); ret = drmm_mode_config_init(dev); if (ret) return ret; dev->mode_config.min_width = GM12U320_USER_WIDTH; dev->mode_config.max_width = GM12U320_USER_WIDTH; dev->mode_config.min_height = GM12U320_HEIGHT; dev->mode_config.max_height = GM12U320_HEIGHT; dev->mode_config.funcs = &gm12u320_mode_config_funcs; ret = gm12u320_usb_alloc(gm12u320); if (ret) return ret; ret = gm12u320_set_ecomode(gm12u320); if (ret) return ret; ret = gm12u320_conn_init(gm12u320); if (ret) return ret; ret = drm_simple_display_pipe_init(&gm12u320->dev, &gm12u320->pipe, &gm12u320_pipe_funcs, gm12u320_pipe_formats, ARRAY_SIZE(gm12u320_pipe_formats), gm12u320_pipe_modifiers, &gm12u320->conn); if (ret) return ret; drm_mode_config_reset(dev); usb_set_intfdata(interface, dev); ret = drm_dev_register(dev, 0); if (ret) return ret; drm_client_setup(dev, NULL); return 0; } static void gm12u320_usb_disconnect(struct usb_interface *interface) { struct drm_device *dev = usb_get_intfdata(interface); drm_dev_unplug(dev); drm_atomic_helper_shutdown(dev); } static int gm12u320_suspend(struct usb_interface *interface, pm_message_t message) { struct drm_device *dev = usb_get_intfdata(interface); return drm_mode_config_helper_suspend(dev); } static int gm12u320_resume(struct usb_interface *interface) { struct drm_device *dev = usb_get_intfdata(interface); struct gm12u320_device *gm12u320 = to_gm12u320(dev); gm12u320_set_ecomode(gm12u320); return drm_mode_config_helper_resume(dev); } static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1de1, 0xc102) }, {}, }; MODULE_DEVICE_TABLE(usb, id_table); static struct usb_driver gm12u320_usb_driver = { .name = "gm12u320", .probe = gm12u320_usb_probe, .disconnect = gm12u320_usb_disconnect, .id_table = id_table, .suspend = pm_ptr(gm12u320_suspend), .resume = pm_ptr(gm12u320_resume), .reset_resume = pm_ptr(gm12u320_resume), }; module_usb_driver(gm12u320_usb_driver); MODULE_AUTHOR("Hans de Goede <hdegoede@redhat.com>"); MODULE_DESCRIPTION("GM12U320 driver for USB projectors"); MODULE_LICENSE("GPL"); |
| 18 18 17 7 1 1 1 1 1 3 53 23 27 1 1 2 14 14 14 2 2 2 1 1 2 2 1 5 4 1 4 4 4 4 1 1 1 1 2 21 1 1 16 3 3 17 12 8 2 15 15 15 1 14 2 15 9 3 3 6 11 106 4 1 1 1 1 54 1 15 6 5 1 26 12 12 7 3 4 5 2 4 3 7 7 7 7 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 | // SPDX-License-Identifier: GPL-2.0-only /* * cec-api.c - HDMI Consumer Electronics Control framework - API * * Copyright 2016 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/init.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/ktime.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/types.h> #include <linux/uaccess.h> #include <linux/version.h> #include <media/cec-pin.h> #include "cec-priv.h" #include "cec-pin-priv.h" static inline struct cec_devnode *cec_devnode_data(struct file *filp) { struct cec_fh *fh = filp->private_data; return &fh->adap->devnode; } /* CEC file operations */ static __poll_t cec_poll(struct file *filp, struct poll_table_struct *poll) { struct cec_fh *fh = filp->private_data; struct cec_adapter *adap = fh->adap; __poll_t res = 0; poll_wait(filp, &fh->wait, poll); if (!cec_is_registered(adap)) return EPOLLERR | EPOLLHUP | EPOLLPRI; mutex_lock(&adap->lock); if (adap->is_configured && adap->transmit_queue_sz < CEC_MAX_MSG_TX_QUEUE_SZ) res |= EPOLLOUT | EPOLLWRNORM; if (fh->queued_msgs) res |= EPOLLIN | EPOLLRDNORM; if (fh->total_queued_events) res |= EPOLLPRI; mutex_unlock(&adap->lock); return res; } static bool cec_is_busy(const struct cec_adapter *adap, const struct cec_fh *fh) { bool valid_initiator = adap->cec_initiator && adap->cec_initiator == fh; bool valid_follower = adap->cec_follower && adap->cec_follower == fh; /* * Exclusive initiators and followers can always access the CEC adapter */ if (valid_initiator || valid_follower) return false; /* * All others can only access the CEC adapter if there is no * exclusive initiator and they are in INITIATOR mode. */ return adap->cec_initiator || fh->mode_initiator == CEC_MODE_NO_INITIATOR; } static long cec_adap_g_caps(struct cec_adapter *adap, struct cec_caps __user *parg) { struct cec_caps caps = {}; strscpy(caps.driver, adap->devnode.dev.parent->driver->name, sizeof(caps.driver)); strscpy(caps.name, adap->name, sizeof(caps.name)); caps.available_log_addrs = adap->available_log_addrs; caps.capabilities = adap->capabilities; caps.version = LINUX_VERSION_CODE; if (copy_to_user(parg, &caps, sizeof(caps))) return -EFAULT; return 0; } static long cec_adap_g_phys_addr(struct cec_adapter *adap, __u16 __user *parg) { u16 phys_addr; mutex_lock(&adap->lock); phys_addr = adap->phys_addr; mutex_unlock(&adap->lock); if (copy_to_user(parg, &phys_addr, sizeof(phys_addr))) return -EFAULT; return 0; } static int cec_validate_phys_addr(u16 phys_addr) { int i; if (phys_addr == CEC_PHYS_ADDR_INVALID) return 0; for (i = 0; i < 16; i += 4) if (phys_addr & (0xf << i)) break; if (i == 16) return 0; for (i += 4; i < 16; i += 4) if ((phys_addr & (0xf << i)) == 0) return -EINVAL; return 0; } static long cec_adap_s_phys_addr(struct cec_adapter *adap, struct cec_fh *fh, bool block, __u16 __user *parg) { u16 phys_addr; long err; if (!(adap->capabilities & CEC_CAP_PHYS_ADDR)) return -ENOTTY; if (copy_from_user(&phys_addr, parg, sizeof(phys_addr))) return -EFAULT; err = cec_validate_phys_addr(phys_addr); if (err) return err; mutex_lock(&adap->lock); if (cec_is_busy(adap, fh)) err = -EBUSY; else __cec_s_phys_addr(adap, phys_addr, block); mutex_unlock(&adap->lock); return err; } static long cec_adap_g_log_addrs(struct cec_adapter *adap, struct cec_log_addrs __user *parg) { struct cec_log_addrs log_addrs; mutex_lock(&adap->lock); /* * We use memcpy here instead of assignment since there is a * hole at the end of struct cec_log_addrs that an assignment * might ignore. So when we do copy_to_user() we could leak * one byte of memory. */ memcpy(&log_addrs, &adap->log_addrs, sizeof(log_addrs)); if (!adap->is_configured) memset(log_addrs.log_addr, CEC_LOG_ADDR_INVALID, sizeof(log_addrs.log_addr)); mutex_unlock(&adap->lock); if (copy_to_user(parg, &log_addrs, sizeof(log_addrs))) return -EFAULT; return 0; } static long cec_adap_s_log_addrs(struct cec_adapter *adap, struct cec_fh *fh, bool block, struct cec_log_addrs __user *parg) { struct cec_log_addrs log_addrs; long err = -EBUSY; if (!(adap->capabilities & CEC_CAP_LOG_ADDRS)) return -ENOTTY; if (copy_from_user(&log_addrs, parg, sizeof(log_addrs))) return -EFAULT; log_addrs.flags &= CEC_LOG_ADDRS_FL_ALLOW_UNREG_FALLBACK | CEC_LOG_ADDRS_FL_ALLOW_RC_PASSTHRU | CEC_LOG_ADDRS_FL_CDC_ONLY; mutex_lock(&adap->lock); if (!adap->is_claiming_log_addrs && !adap->is_configuring && (!log_addrs.num_log_addrs || !adap->is_configured) && !cec_is_busy(adap, fh)) { err = __cec_s_log_addrs(adap, &log_addrs, block); if (!err) log_addrs = adap->log_addrs; } mutex_unlock(&adap->lock); if (err) return err; if (copy_to_user(parg, &log_addrs, sizeof(log_addrs))) return -EFAULT; return 0; } static long cec_adap_g_connector_info(struct cec_adapter *adap, struct cec_log_addrs __user *parg) { int ret = 0; if (!(adap->capabilities & CEC_CAP_CONNECTOR_INFO)) return -ENOTTY; mutex_lock(&adap->lock); if (copy_to_user(parg, &adap->conn_info, sizeof(adap->conn_info))) ret = -EFAULT; mutex_unlock(&adap->lock); return ret; } static long cec_transmit(struct cec_adapter *adap, struct cec_fh *fh, bool block, struct cec_msg __user *parg) { struct cec_msg msg = {}; long err = 0; if (!(adap->capabilities & CEC_CAP_TRANSMIT)) return -ENOTTY; if (copy_from_user(&msg, parg, sizeof(msg))) return -EFAULT; mutex_lock(&adap->lock); if (adap->log_addrs.num_log_addrs == 0) err = -EPERM; else if (adap->is_configuring && !msg_is_raw(&msg)) err = -ENONET; else if (cec_is_busy(adap, fh)) err = -EBUSY; else err = cec_transmit_msg_fh(adap, &msg, fh, block); mutex_unlock(&adap->lock); if (err) return err; if (copy_to_user(parg, &msg, sizeof(msg))) return -EFAULT; return 0; } /* Called by CEC_RECEIVE: wait for a message to arrive */ static int cec_receive_msg(struct cec_fh *fh, struct cec_msg *msg, bool block) { u32 timeout = msg->timeout; int res; do { mutex_lock(&fh->lock); /* Are there received messages queued up? */ if (fh->queued_msgs) { /* Yes, return the first one */ struct cec_msg_entry *entry = list_first_entry(&fh->msgs, struct cec_msg_entry, list); list_del(&entry->list); *msg = entry->msg; kfree(entry); fh->queued_msgs--; mutex_unlock(&fh->lock); /* restore original timeout value */ msg->timeout = timeout; return 0; } /* No, return EAGAIN in non-blocking mode or wait */ mutex_unlock(&fh->lock); /* Return when in non-blocking mode */ if (!block) return -EAGAIN; if (msg->timeout) { /* The user specified a timeout */ res = wait_event_interruptible_timeout(fh->wait, fh->queued_msgs, msecs_to_jiffies(msg->timeout)); if (res == 0) res = -ETIMEDOUT; else if (res > 0) res = 0; } else { /* Wait indefinitely */ res = wait_event_interruptible(fh->wait, fh->queued_msgs); } /* Exit on error, otherwise loop to get the new message */ } while (!res); return res; } static long cec_receive(struct cec_adapter *adap, struct cec_fh *fh, bool block, struct cec_msg __user *parg) { struct cec_msg msg = {}; long err; if (copy_from_user(&msg, parg, sizeof(msg))) return -EFAULT; err = cec_receive_msg(fh, &msg, block); if (err) return err; msg.flags = 0; if (copy_to_user(parg, &msg, sizeof(msg))) return -EFAULT; return 0; } static long cec_dqevent(struct cec_adapter *adap, struct cec_fh *fh, bool block, struct cec_event __user *parg) { struct cec_event_entry *ev = NULL; u64 ts = ~0ULL; unsigned int i; unsigned int ev_idx; long err = 0; mutex_lock(&fh->lock); while (!fh->total_queued_events && block) { mutex_unlock(&fh->lock); err = wait_event_interruptible(fh->wait, fh->total_queued_events); if (err) return err; mutex_lock(&fh->lock); } /* Find the oldest event */ for (i = 0; i < CEC_NUM_EVENTS; i++) { struct cec_event_entry *entry = list_first_entry_or_null(&fh->events[i], struct cec_event_entry, list); if (entry && entry->ev.ts <= ts) { ev = entry; ev_idx = i; ts = ev->ev.ts; } } if (!ev) { err = -EAGAIN; goto unlock; } list_del(&ev->list); if (copy_to_user(parg, &ev->ev, sizeof(ev->ev))) err = -EFAULT; if (ev_idx >= CEC_NUM_CORE_EVENTS) kfree(ev); fh->queued_events[ev_idx]--; fh->total_queued_events--; unlock: mutex_unlock(&fh->lock); return err; } static long cec_g_mode(struct cec_adapter *adap, struct cec_fh *fh, u32 __user *parg) { u32 mode = fh->mode_initiator | fh->mode_follower; if (copy_to_user(parg, &mode, sizeof(mode))) return -EFAULT; return 0; } static long cec_s_mode(struct cec_adapter *adap, struct cec_fh *fh, u32 __user *parg) { u32 mode; u8 mode_initiator; u8 mode_follower; bool send_pin_event = false; long err = 0; if (copy_from_user(&mode, parg, sizeof(mode))) return -EFAULT; if (mode & ~(CEC_MODE_INITIATOR_MSK | CEC_MODE_FOLLOWER_MSK)) { dprintk(1, "%s: invalid mode bits set\n", __func__); return -EINVAL; } mode_initiator = mode & CEC_MODE_INITIATOR_MSK; mode_follower = mode & CEC_MODE_FOLLOWER_MSK; if (mode_initiator > CEC_MODE_EXCL_INITIATOR || mode_follower > CEC_MODE_MONITOR_ALL) { dprintk(1, "%s: unknown mode\n", __func__); return -EINVAL; } if (mode_follower == CEC_MODE_MONITOR_ALL && !(adap->capabilities & CEC_CAP_MONITOR_ALL)) { dprintk(1, "%s: MONITOR_ALL not supported\n", __func__); return -EINVAL; } if (mode_follower == CEC_MODE_MONITOR_PIN && !(adap->capabilities & CEC_CAP_MONITOR_PIN)) { dprintk(1, "%s: MONITOR_PIN not supported\n", __func__); return -EINVAL; } /* Follower modes should always be able to send CEC messages */ if ((mode_initiator == CEC_MODE_NO_INITIATOR || !(adap->capabilities & CEC_CAP_TRANSMIT)) && mode_follower >= CEC_MODE_FOLLOWER && mode_follower <= CEC_MODE_EXCL_FOLLOWER_PASSTHRU) { dprintk(1, "%s: cannot transmit\n", __func__); return -EINVAL; } /* Monitor modes require CEC_MODE_NO_INITIATOR */ if (mode_initiator && mode_follower >= CEC_MODE_MONITOR_PIN) { dprintk(1, "%s: monitor modes require NO_INITIATOR\n", __func__); return -EINVAL; } /* Monitor modes require CAP_NET_ADMIN */ if (mode_follower >= CEC_MODE_MONITOR_PIN && !capable(CAP_NET_ADMIN)) return -EPERM; mutex_lock(&adap->lock); /* * You can't become exclusive follower if someone else already * has that job. */ if ((mode_follower == CEC_MODE_EXCL_FOLLOWER || mode_follower == CEC_MODE_EXCL_FOLLOWER_PASSTHRU) && adap->cec_follower && adap->cec_follower != fh) err = -EBUSY; /* * You can't become exclusive initiator if someone else already * has that job. */ if (mode_initiator == CEC_MODE_EXCL_INITIATOR && adap->cec_initiator && adap->cec_initiator != fh) err = -EBUSY; if (!err) { bool old_mon_all = fh->mode_follower == CEC_MODE_MONITOR_ALL; bool new_mon_all = mode_follower == CEC_MODE_MONITOR_ALL; if (old_mon_all != new_mon_all) { if (new_mon_all) err = cec_monitor_all_cnt_inc(adap); else cec_monitor_all_cnt_dec(adap); } } if (!err) { bool old_mon_pin = fh->mode_follower == CEC_MODE_MONITOR_PIN; bool new_mon_pin = mode_follower == CEC_MODE_MONITOR_PIN; if (old_mon_pin != new_mon_pin) { send_pin_event = new_mon_pin; if (new_mon_pin) err = cec_monitor_pin_cnt_inc(adap); else cec_monitor_pin_cnt_dec(adap); } } if (err) { mutex_unlock(&adap->lock); return err; } if (fh->mode_follower == CEC_MODE_FOLLOWER) adap->follower_cnt--; if (mode_follower == CEC_MODE_FOLLOWER) adap->follower_cnt++; if (send_pin_event) { struct cec_event ev = { .flags = CEC_EVENT_FL_INITIAL_STATE, }; ev.event = adap->cec_pin_is_high ? CEC_EVENT_PIN_CEC_HIGH : CEC_EVENT_PIN_CEC_LOW; cec_queue_event_fh(fh, &ev, 0); } if (mode_follower == CEC_MODE_EXCL_FOLLOWER || mode_follower == CEC_MODE_EXCL_FOLLOWER_PASSTHRU) { adap->passthrough = mode_follower == CEC_MODE_EXCL_FOLLOWER_PASSTHRU; adap->cec_follower = fh; } else if (adap->cec_follower == fh) { adap->passthrough = false; adap->cec_follower = NULL; } if (mode_initiator == CEC_MODE_EXCL_INITIATOR) adap->cec_initiator = fh; else if (adap->cec_initiator == fh) adap->cec_initiator = NULL; fh->mode_initiator = mode_initiator; fh->mode_follower = mode_follower; mutex_unlock(&adap->lock); return 0; } static long cec_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct cec_fh *fh = filp->private_data; struct cec_adapter *adap = fh->adap; bool block = !(filp->f_flags & O_NONBLOCK); void __user *parg = (void __user *)arg; if (!cec_is_registered(adap)) return -ENODEV; switch (cmd) { case CEC_ADAP_G_CAPS: return cec_adap_g_caps(adap, parg); case CEC_ADAP_G_PHYS_ADDR: return cec_adap_g_phys_addr(adap, parg); case CEC_ADAP_S_PHYS_ADDR: return cec_adap_s_phys_addr(adap, fh, block, parg); case CEC_ADAP_G_LOG_ADDRS: return cec_adap_g_log_addrs(adap, parg); case CEC_ADAP_S_LOG_ADDRS: return cec_adap_s_log_addrs(adap, fh, block, parg); case CEC_ADAP_G_CONNECTOR_INFO: return cec_adap_g_connector_info(adap, parg); case CEC_TRANSMIT: return cec_transmit(adap, fh, block, parg); case CEC_RECEIVE: return cec_receive(adap, fh, block, parg); case CEC_DQEVENT: return cec_dqevent(adap, fh, block, parg); case CEC_G_MODE: return cec_g_mode(adap, fh, parg); case CEC_S_MODE: return cec_s_mode(adap, fh, parg); default: return -ENOTTY; } } static int cec_open(struct inode *inode, struct file *filp) { struct cec_devnode *devnode = container_of(inode->i_cdev, struct cec_devnode, cdev); struct cec_adapter *adap = to_cec_adapter(devnode); struct cec_fh *fh = kzalloc(sizeof(*fh), GFP_KERNEL); /* * Initial events that are automatically sent when the cec device is * opened. */ struct cec_event ev = { .event = CEC_EVENT_STATE_CHANGE, .flags = CEC_EVENT_FL_INITIAL_STATE, }; unsigned int i; int err; if (!fh) return -ENOMEM; INIT_LIST_HEAD(&fh->msgs); INIT_LIST_HEAD(&fh->xfer_list); for (i = 0; i < CEC_NUM_EVENTS; i++) INIT_LIST_HEAD(&fh->events[i]); mutex_init(&fh->lock); init_waitqueue_head(&fh->wait); fh->mode_initiator = CEC_MODE_INITIATOR; fh->adap = adap; err = cec_get_device(adap); if (err) { kfree(fh); return err; } filp->private_data = fh; /* Queue up initial state events */ ev.state_change.phys_addr = adap->phys_addr; ev.state_change.log_addr_mask = adap->log_addrs.log_addr_mask; ev.state_change.have_conn_info = adap->conn_info.type != CEC_CONNECTOR_TYPE_NO_CONNECTOR; cec_queue_event_fh(fh, &ev, 0); #ifdef CONFIG_CEC_PIN if (adap->pin && adap->pin->ops->read_hpd && !adap->devnode.unregistered) { err = adap->pin->ops->read_hpd(adap); if (err >= 0) { ev.event = err ? CEC_EVENT_PIN_HPD_HIGH : CEC_EVENT_PIN_HPD_LOW; cec_queue_event_fh(fh, &ev, 0); } } if (adap->pin && adap->pin->ops->read_5v && !adap->devnode.unregistered) { err = adap->pin->ops->read_5v(adap); if (err >= 0) { ev.event = err ? CEC_EVENT_PIN_5V_HIGH : CEC_EVENT_PIN_5V_LOW; cec_queue_event_fh(fh, &ev, 0); } } #endif mutex_lock(&devnode->lock); mutex_lock(&devnode->lock_fhs); list_add(&fh->list, &devnode->fhs); mutex_unlock(&devnode->lock_fhs); mutex_unlock(&devnode->lock); return 0; } /* Override for the release function */ static int cec_release(struct inode *inode, struct file *filp) { struct cec_devnode *devnode = cec_devnode_data(filp); struct cec_adapter *adap = to_cec_adapter(devnode); struct cec_fh *fh = filp->private_data; unsigned int i; mutex_lock(&adap->lock); if (adap->cec_initiator == fh) adap->cec_initiator = NULL; if (adap->cec_follower == fh) { adap->cec_follower = NULL; adap->passthrough = false; } if (fh->mode_follower == CEC_MODE_FOLLOWER) adap->follower_cnt--; if (fh->mode_follower == CEC_MODE_MONITOR_PIN) cec_monitor_pin_cnt_dec(adap); if (fh->mode_follower == CEC_MODE_MONITOR_ALL) cec_monitor_all_cnt_dec(adap); mutex_unlock(&adap->lock); mutex_lock(&devnode->lock); mutex_lock(&devnode->lock_fhs); list_del(&fh->list); mutex_unlock(&devnode->lock_fhs); mutex_unlock(&devnode->lock); /* Unhook pending transmits from this filehandle. */ mutex_lock(&adap->lock); while (!list_empty(&fh->xfer_list)) { struct cec_data *data = list_first_entry(&fh->xfer_list, struct cec_data, xfer_list); data->blocking = false; data->fh = NULL; list_del_init(&data->xfer_list); } mutex_unlock(&adap->lock); mutex_lock(&fh->lock); while (!list_empty(&fh->msgs)) { struct cec_msg_entry *entry = list_first_entry(&fh->msgs, struct cec_msg_entry, list); list_del(&entry->list); kfree(entry); } for (i = CEC_NUM_CORE_EVENTS; i < CEC_NUM_EVENTS; i++) { while (!list_empty(&fh->events[i])) { struct cec_event_entry *entry = list_first_entry(&fh->events[i], struct cec_event_entry, list); list_del(&entry->list); kfree(entry); } } mutex_unlock(&fh->lock); kfree(fh); cec_put_device(adap); filp->private_data = NULL; return 0; } const struct file_operations cec_devnode_fops = { .owner = THIS_MODULE, .open = cec_open, .unlocked_ioctl = cec_ioctl, .compat_ioctl = cec_ioctl, .release = cec_release, .poll = cec_poll, }; |
| 28 4 1 6 16 22 16 36 1 1 6 28 3 59 7 9 11 2 9 9 7 2 59 49 10 3 26 1 11 1 35 3 2 1 73 40 42 25 61 58 3 54 6 67 2 7 79 79 57 10 78 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 | // SPDX-License-Identifier: GPL-2.0 #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/exportfs.h> #include <linux/fs_struct.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/compat.h> #include <linux/nsfs.h> #include "internal.h" #include "mount.h" static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, void __user *mnt_id, bool unique_mntid, int fh_flags) { long retval; struct file_handle f_handle; int handle_dwords, handle_bytes; struct file_handle *handle = NULL; /* * We need to make sure whether the file system support decoding of * the file handle if decodeable file handle was requested. */ if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags)) return -EOPNOTSUPP; /* * A request to encode a connectable handle for a disconnected dentry * is unexpected since AT_EMPTY_PATH is not allowed. */ if (fh_flags & EXPORT_FH_CONNECTABLE && WARN_ON(path->dentry->d_flags & DCACHE_DISCONNECTED)) return -EINVAL; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) return -EFAULT; if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; handle = kzalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) return -ENOMEM; /* convert handle size to multiple of sizeof(u32) */ handle_dwords = f_handle.handle_bytes >> 2; /* Encode a possibly decodeable/connectable file handle */ retval = exportfs_encode_fh(path->dentry, (struct fid *)handle->f_handle, &handle_dwords, fh_flags); handle->handle_type = retval; /* convert handle size to bytes */ handle_bytes = handle_dwords * sizeof(u32); handle->handle_bytes = handle_bytes; if ((handle->handle_bytes > f_handle.handle_bytes) || (retval == FILEID_INVALID) || (retval < 0)) { /* As per old exportfs_encode_fh documentation * we could return ENOSPC to indicate overflow * But file system returned 255 always. So handle * both the values */ if (retval == FILEID_INVALID || retval == -ENOSPC) retval = -EOVERFLOW; /* * set the handle size to zero so we copy only * non variable part of the file_handle */ handle_bytes = 0; } else { /* * When asked to encode a connectable file handle, encode this * property in the file handle itself, so that we later know * how to decode it. * For sanity, also encode in the file handle if the encoded * object is a directory and verify this during decode, because * decoding directory file handles is quite different than * decoding connectable non-directory file handles. */ if (fh_flags & EXPORT_FH_CONNECTABLE) { handle->handle_type |= FILEID_IS_CONNECTABLE; if (d_is_dir(path->dentry)) handle->handle_type |= FILEID_IS_DIR; } retval = 0; } /* copy the mount id */ if (unique_mntid) { if (put_user(real_mount(path->mnt)->mnt_id_unique, (u64 __user *) mnt_id)) retval = -EFAULT; } else { if (put_user(real_mount(path->mnt)->mnt_id, (int __user *) mnt_id)) retval = -EFAULT; } /* copy the handle */ if (retval != -EFAULT && copy_to_user(ufh, handle, struct_size(handle, f_handle, handle_bytes))) retval = -EFAULT; kfree(handle); return retval; } /** * sys_name_to_handle_at: convert name to handle * @dfd: directory relative to which name is interpreted if not absolute * @name: name that should be converted to handle. * @handle: resulting file handle * @mnt_id: mount id of the file system containing the file * (u64 if AT_HANDLE_MNT_ID_UNIQUE, otherwise int) * @flag: flag value to indicate whether to follow symlink or not * and whether a decodable file handle is required. * * @handle->handle_size indicate the space available to store the * variable part of the file handle in bytes. If there is not * enough space, the field is updated to return the minimum * value required. */ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, void __user *, mnt_id, int, flag) { struct path path; int lookup_flags; int fh_flags = 0; int err; if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID | AT_HANDLE_MNT_ID_UNIQUE | AT_HANDLE_CONNECTABLE)) return -EINVAL; /* * AT_HANDLE_FID means there is no intention to decode file handle * AT_HANDLE_CONNECTABLE means there is an intention to decode a * connected fd (with known path), so these flags are conflicting. * AT_EMPTY_PATH could be used along with a dfd that refers to a * disconnected non-directory, which cannot be used to encode a * connectable file handle, because its parent is unknown. */ if (flag & AT_HANDLE_CONNECTABLE && flag & (AT_HANDLE_FID | AT_EMPTY_PATH)) return -EINVAL; else if (flag & AT_HANDLE_FID) fh_flags |= EXPORT_FH_FID; else if (flag & AT_HANDLE_CONNECTABLE) fh_flags |= EXPORT_FH_CONNECTABLE; lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); if (!err) { err = do_sys_name_to_handle(&path, handle, mnt_id, flag & AT_HANDLE_MNT_ID_UNIQUE, fh_flags); path_put(&path); } return err; } static int get_path_anchor(int fd, struct path *root) { if (fd >= 0) { CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; *root = fd_file(f)->f_path; path_get(root); return 0; } if (fd == AT_FDCWD) { get_fs_pwd(current->fs, root); return 0; } if (fd == FD_PIDFS_ROOT) { pidfs_get_root(root); return 0; } if (fd == FD_NSFS_ROOT) { nsfs_get_root(root); return 0; } return -EBADF; } static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { struct handle_to_path_ctx *ctx = context; struct user_namespace *user_ns = current_user_ns(); struct dentry *d, *root = ctx->root.dentry; struct mnt_idmap *idmap = mnt_idmap(ctx->root.mnt); int retval = 0; if (!root) return 1; /* Old permission model with global CAP_DAC_READ_SEARCH. */ if (!ctx->flags) return 1; /* * Verify that the decoded dentry itself has a valid id mapping. * In case the decoded dentry is the mountfd root itself, this * verifies that the mountfd inode itself has a valid id mapping. */ if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(dentry))) return 0; /* * It's racy as we're not taking rename_lock but we're able to ignore * permissions and we just need an approximation whether we were able * to follow a path to the file. * * It's also potentially expensive on some filesystems especially if * there is a deep path. */ d = dget(dentry); while (d != root && !IS_ROOT(d)) { struct dentry *parent = dget_parent(d); /* * We know that we have the ability to override DAC permissions * as we've verified this earlier via CAP_DAC_READ_SEARCH. But * we also need to make sure that there aren't any unmapped * inodes in the path that would prevent us from reaching the * file. */ if (!privileged_wrt_inode_uidgid(user_ns, idmap, d_inode(parent))) { dput(d); dput(parent); return retval; } dput(d); d = parent; } if (!(ctx->flags & HANDLE_CHECK_SUBTREE) || d == root) retval = 1; /* * exportfs_decode_fh_raw() does not call acceptable() callback with * a disconnected directory dentry, so we should have reached either * mount fd directory or sb root. */ if (ctx->fh_flags & EXPORT_FH_DIR_ONLY) WARN_ON_ONCE(d != root && d != root->d_sb->s_root); dput(d); return retval; } static int do_handle_to_path(struct file_handle *handle, struct path *path, struct handle_to_path_ctx *ctx) { int handle_dwords; struct vfsmount *mnt = ctx->root.mnt; struct dentry *dentry; /* change the handle size to multiple of sizeof(u32) */ handle_dwords = handle->handle_bytes >> 2; dentry = exportfs_decode_fh_raw(mnt, (struct fid *)handle->f_handle, handle_dwords, handle->handle_type, ctx->fh_flags, vfs_dentry_acceptable, ctx); if (IS_ERR_OR_NULL(dentry)) { if (dentry == ERR_PTR(-ENOMEM)) return -ENOMEM; return -ESTALE; } path->dentry = dentry; path->mnt = mntget(mnt); return 0; } static inline int may_decode_fh(struct handle_to_path_ctx *ctx, unsigned int o_flags) { struct path *root = &ctx->root; if (capable(CAP_DAC_READ_SEARCH)) return 0; /* * Allow relaxed permissions of file handles if the caller has * the ability to mount the filesystem or create a bind-mount of * the provided @mountdirfd. * * In both cases the caller may be able to get an unobstructed * way to the encoded file handle. If the caller is only able to * create a bind-mount we need to verify that there are no * locked mounts on top of it that could prevent us from getting * to the encoded file. * * In principle, locked mounts can prevent the caller from * mounting the filesystem but that only applies to procfs and * sysfs neither of which support decoding file handles. * * Restrict to O_DIRECTORY to provide a deterministic API that * avoids a confusing api in the face of disconnected non-dir * dentries. * * There's only one dentry for each directory inode (VFS rule)... */ if (!(o_flags & O_DIRECTORY)) return -EPERM; if (ns_capable(root->mnt->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) ctx->flags = HANDLE_CHECK_PERMS; else if (is_mounted(root->mnt) && ns_capable(real_mount(root->mnt)->mnt_ns->user_ns, CAP_SYS_ADMIN) && !has_locked_children(real_mount(root->mnt), root->dentry)) ctx->flags = HANDLE_CHECK_PERMS | HANDLE_CHECK_SUBTREE; else return -EPERM; /* Are we able to override DAC permissions? */ if (!ns_capable(current_user_ns(), CAP_DAC_READ_SEARCH)) return -EPERM; ctx->fh_flags = EXPORT_FH_DIR_ONLY; return 0; } static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, struct path *path, unsigned int o_flags) { int retval = 0; struct file_handle f_handle; struct file_handle *handle __free(kfree) = NULL; struct handle_to_path_ctx ctx = {}; const struct export_operations *eops; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) return -EFAULT; if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || (f_handle.handle_bytes == 0)) return -EINVAL; if (f_handle.handle_type < 0 || FILEID_USER_FLAGS(f_handle.handle_type) & ~FILEID_VALID_USER_FLAGS) return -EINVAL; retval = get_path_anchor(mountdirfd, &ctx.root); if (retval) return retval; eops = ctx.root.mnt->mnt_sb->s_export_op; if (eops && eops->permission) retval = eops->permission(&ctx, o_flags); else retval = may_decode_fh(&ctx, o_flags); if (retval) goto out_path; handle = kmalloc(struct_size(handle, f_handle, f_handle.handle_bytes), GFP_KERNEL); if (!handle) { retval = -ENOMEM; goto out_path; } /* copy the full handle */ *handle = f_handle; if (copy_from_user(&handle->f_handle, &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; goto out_path; } /* * If handle was encoded with AT_HANDLE_CONNECTABLE, verify that we * are decoding an fd with connected path, which is accessible from * the mount fd path. */ if (f_handle.handle_type & FILEID_IS_CONNECTABLE) { ctx.fh_flags |= EXPORT_FH_CONNECTABLE; ctx.flags |= HANDLE_CHECK_SUBTREE; } if (f_handle.handle_type & FILEID_IS_DIR) ctx.fh_flags |= EXPORT_FH_DIR_ONLY; /* Filesystem code should not be exposed to user flags */ handle->handle_type &= ~FILEID_USER_FLAGS_MASK; retval = do_handle_to_path(handle, path, &ctx); out_path: path_put(&ctx.root); return retval; } static struct file *file_open_handle(struct path *path, int open_flag) { const struct export_operations *eops; eops = path->mnt->mnt_sb->s_export_op; if (eops->open) return eops->open(path, open_flag); return file_open_root(path, "", open_flag, 0); } static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { long retval; struct path path __free(path_put) = {}; retval = handle_to_path(mountdirfd, ufh, &path, open_flag); if (retval) return retval; return FD_ADD(open_flag, file_open_handle(&path, open_flag)); } /** * sys_open_by_handle_at: Open the file handle * @mountdirfd: directory file descriptor * @handle: file handle to be opened * @flags: open flags. * * @mountdirfd indicate the directory file descriptor * of the mount point. file handle is decoded relative * to the vfsmount pointed by the @mountdirfd. @flags * value is same as the open(2) flags. */ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, struct file_handle __user *, handle, int, flags) { long ret; if (force_o_largefile()) flags |= O_LARGEFILE; ret = do_handle_open(mountdirfd, handle, flags); return ret; } #ifdef CONFIG_COMPAT /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it * doesn't set the O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, struct file_handle __user *, handle, int, flags) { return do_handle_open(mountdirfd, handle, flags); } #endif |
| 338 1 283 1 347 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */ |
| 7 3 3 4 13 16 17 20 20 1 20 20 2 20 20 20 20 1 18 20 20 20 19 15 15 20 20 9 11 18 20 20 18 18 13 18 21 6 17 17 17 16 5 1 2 1 1 2 2 1 1 2 2 2 2 3 3 58 58 1 4 3 1 46 46 46 46 46 46 45 46 46 46 45 18 2 2 2 1 1 1 2 1 9 8 10 2 4 5 4 4 4 4 2 2 2 2 1 3 4 4 3 4 4 2 1 3 3 1 3 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 | // SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/ctype.h> #include <linux/kmod.h> #include <linux/sort.h> #include <linux/delay.h> #include <linux/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/magic.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/vmalloc.h> #include <linux/delayacct.h> #include <linux/pid_namespace.h> #include <linux/cgroupstats.h> #include <linux/fs_parser.h> #include <trace/events/cgroup.h> /* * pidlists linger the following amount before being destroyed. The goal * is avoiding frequent destruction in the middle of consecutive read calls * Expiring in the middle is a performance problem not a correctness one. * 1 sec should be enough. */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ /* Controllers blocked by the commandline in v1 */ static u16 cgroup_no_v1_mask; /* disable named v1 mounts */ static bool cgroup_no_v1_named; /* Show unavailable controllers in /proc/cgroups */ static bool proc_show_all; /* * pidlist destructions need to be flushed on cgroup destruction. Use a * separate workqueue as flush domain. */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; /* protects cgroup_subsys->release_agent_path */ static DEFINE_SPINLOCK(release_agent_path_lock); bool cgroup1_ssid_disabled(int ssid) { return cgroup_no_v1_mask & (1 << ssid); } static bool cgroup1_subsys_absent(struct cgroup_subsys *ss) { /* Check also dfl_cftypes for file-less controllers, i.e. perf_event */ return ss->legacy_cftypes == NULL && ss->dfl_cftypes; } /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task * @tsk: the task to be attached * * Return: %0 on success or a negative errno code on failure */ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { struct cgroup_root *root; int retval = 0; cgroup_lock(); cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); for_each_root(root) { struct cgroup *from_cgrp; spin_lock_irq(&css_set_lock); from_cgrp = task_cgroup_from_root(from, root); spin_unlock_irq(&css_set_lock); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return retval; } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** * cgroup_transfer_tasks - move tasks from one cgroup to another * @to: cgroup to which the tasks will be moved * @from: cgroup in which the tasks currently reside * * Locking rules between cgroup_post_fork() and the migration path * guarantee that, if a task is forking while being migrated, the new child * is guaranteed to be either visible in the source cgroup after the * parent's migration is complete or put into the target cgroup. No task * can slip out of migration through forking. * * Return: %0 on success or a negative errno code on failure */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { DEFINE_CGROUP_MGCTX(mgctx); struct cgrp_cset_link *link; struct css_task_iter it; struct task_struct *task; int ret; if (cgroup_on_dfl(to)) return -EINVAL; ret = cgroup_migrate_vet_dst(to); if (ret) return ret; cgroup_lock(); cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); /* all tasks in @from are being moved, all csets are source */ spin_lock_irq(&css_set_lock); list_for_each_entry(link, &from->cset_links, cset_link) cgroup_migrate_add_src(link->cset, to, &mgctx); spin_unlock_irq(&css_set_lock); ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_err; /* * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { css_task_iter_start(&from->self, 0, &it); do { task = css_task_iter_next(&it); } while (task && (task->flags & PF_EXITING)); if (task) get_task_struct(task); css_task_iter_end(&it); if (task) { ret = cgroup_migrate(task, false, &mgctx); if (!ret) TRACE_CGROUP_PATH(transfer_tasks, to, task, false); put_task_struct(task); } } while (task && !ret); out_err: cgroup_migrate_finish(&mgctx); cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); cgroup_unlock(); return ret; } /* * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), * but we cannot guarantee that the information we produce is correct * unless we produce it entirely atomically. * */ /* which pidlist file are we talking about? */ enum cgroup_filetype { CGROUP_FILE_PROCS, CGROUP_FILE_TASKS, }; /* * A pidlist is a list of pids that virtually represents the contents of one * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, * a pair (one each for procs, tasks) for each pid namespace that's relevant * to the cgroup. */ struct cgroup_pidlist { /* * used to find which pidlist is wanted. doesn't change as long as * this particular list stays in the list. */ struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; /* array of xids */ pid_t *list; /* how many elements the above list has */ int length; /* each of these stored in a list by its cgroup */ struct list_head links; /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* for delayed destruction */ struct delayed_work destroy_dwork; }; /* * Used to destroy all pidlists lingering waiting for destroy timer. None * should be left afterwards. */ void cgroup1_pidlist_destroy_all(struct cgroup *cgrp) { struct cgroup_pidlist *l, *tmp_l; mutex_lock(&cgrp->pidlist_mutex); list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0); mutex_unlock(&cgrp->pidlist_mutex); flush_workqueue(cgroup_pidlist_destroy_wq); BUG_ON(!list_empty(&cgrp->pidlists)); } static void cgroup_pidlist_destroy_work_fn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist, destroy_dwork); struct cgroup_pidlist *tofree = NULL; mutex_lock(&l->owner->pidlist_mutex); /* * Destroy iff we didn't get queued again. The state won't change * as destroy_dwork can only be queued while locked. */ if (!delayed_work_pending(dwork)) { list_del(&l->links); kvfree(l->list); put_pid_ns(l->key.ns); tofree = l; } mutex_unlock(&l->owner->pidlist_mutex); kfree(tofree); } /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries * Returns the number of unique elements. */ static int pidlist_uniq(pid_t *list, int length) { int src, dest = 1; /* * we presume the 0th element is unique, so i starts at 1. trivial * edge cases first; no work needs to be done for either */ if (length == 0 || length == 1) return length; /* src and dest walk down the list; dest counts unique elements */ for (src = 1; src < length; src++) { /* find next unique element */ while (list[src] == list[src-1]) { src++; if (src == length) goto after; } /* dest always points to where the next unique element goes */ list[dest] = list[src]; dest++; } after: return dest; } /* * The two pid files - task and cgroup.procs - guaranteed that the result * is sorted, which forced this whole pidlist fiasco. As pid order is * different per namespace, each namespace needs differently sorted list, * making it impossible to use, for example, single rbtree of member tasks * sorted by task pointer. As pidlists can be fairly large, allocating one * per open file is dangerous, so cgroup had to implement shared pool of * pidlists keyed by cgroup and namespace. */ static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ struct pid_namespace *ns = task_active_pid_ns(current); lockdep_assert_held(&cgrp->pidlist_mutex); list_for_each_entry(l, &cgrp->pidlists, links) if (l->key.type == type && l->key.ns == ns) return l; return NULL; } /* * find the appropriate pidlist for our purpose (given procs vs tasks) * returns with the lock on that pidlist already held, and takes care * of the use count, or returns NULL with no locks held if we're out of * memory. */ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, enum cgroup_filetype type) { struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); l = cgroup_pidlist_find(cgrp, type); if (l) return l; /* entry not found; create a new one */ l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) return l; INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn); l->key.type = type; /* don't need task_nsproxy() if we're looking at ourself */ l->key.ns = get_pid_ns(task_active_pid_ns(current)); l->owner = cgrp; list_add(&l->links, &cgrp->pidlists); return l; } /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, struct cgroup_pidlist **lp) { pid_t *array; int length; int pid, n = 0; /* used for populating the array */ struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; lockdep_assert_held(&cgrp->pidlist_mutex); /* * If cgroup gets more users after we read count, we won't have * enough space - tough. This race is indistinguishable to the * caller from the case that the additional cgroup users didn't * show up until sometime later on. */ length = cgroup_task_count(cgrp); array = kvmalloc_array(length, sizeof(pid_t), GFP_KERNEL); if (!array) return -ENOMEM; /* now, populate the array */ css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ if (type == CGROUP_FILE_PROCS) pid = task_tgid_vnr(tsk); else pid = task_pid_vnr(tsk); if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } css_task_iter_end(&it); length = n; /* now sort & strip out duplicates (tgids or recycled thread PIDs) */ sort(array, length, sizeof(pid_t), cmppid, NULL); length = pidlist_uniq(array, length); l = cgroup_pidlist_find_create(cgrp, type); if (!l) { kvfree(array); return -ENOMEM; } /* store array, freeing old if necessary */ kvfree(l->list); l->list = array; l->length = length; *lp = l; return 0; } /* * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid * in the cgroup->l->list array. */ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to * one more than the last pid shown (or 0 on the first call or * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; int index = 0, pid = *pos; int *iter, ret; mutex_lock(&cgrp->pidlist_mutex); /* * !NULL @ctx->procs1.pidlist indicates that this isn't the first * start() after open. If the matching pidlist is around, we can use * that. Look for it. Note that @ctx->procs1.pidlist can't be used * directly. It could already have been destroyed. */ if (ctx->procs1.pidlist) ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ if (!ctx->procs1.pidlist) { ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } l = ctx->procs1.pidlist; if (pid) { int end = l->length; while (index < end) { int mid = (index + end) / 2; if (l->list[mid] == pid) { index = mid; break; } else if (l->list[mid] < pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; *pos = *iter; return iter; } static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, CGROUP_PIDLIST_DESTROY_DELAY); mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; struct cgroup_file_ctx *ctx = of->priv; struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done */ p++; if (p >= end) { (*pos)++; return NULL; } else { *pos = *p; return p; } } static int cgroup_pidlist_show(struct seq_file *s, void *v) { seq_printf(s, "%d\n", *(int *)v); return 0; } static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool threadgroup) { struct cgroup *cgrp; struct task_struct *task; const struct cred *cred, *tcred; ssize_t ret; enum cgroup_attach_lock_mode lock_mode; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); ret = PTR_ERR_OR_ZERO(task); if (ret) goto out_unlock; /* * Even if we're attaching all tasks in the thread group, we only need * to check permissions on one of them. Check permissions using the * credentials from file open to protect against inherited fd attacks. */ cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid)) ret = -EACCES; put_cred(tcred); if (ret) goto out_finish; ret = cgroup_attach_task(cgrp, task, threadgroup); out_finish: cgroup_procs_write_finish(task, lock_mode); out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; } static ssize_t cgroup1_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, true); } static ssize_t cgroup1_tasks_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return __cgroup1_procs_write(of, buf, nbytes, off, false); } static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct cgroup *cgrp; struct cgroup_file_ctx *ctx; BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. */ ctx = of->priv; if ((ctx->ns->user_ns != &init_user_ns) || !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN)) return -EPERM; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); strscpy(cgrp->root->release_agent_path, strstrip(buf), sizeof(cgrp->root->release_agent_path)); spin_unlock(&release_agent_path_lock); cgroup_kn_unlock(of->kn); return nbytes; } static int cgroup_release_agent_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; spin_lock(&release_agent_path_lock); seq_puts(seq, cgrp->root->release_agent_path); spin_unlock(&release_agent_path_lock); seq_putc(seq, '\n'); return 0; } static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) { seq_puts(seq, "0\n"); return 0; } static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft) { return notify_on_release(css->cgroup); } static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } static int cgroup_clone_children_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { if (val) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } /* cgroup core interface files for the legacy hierarchies */ struct cftype cgroup1_base_files[] = { { .name = "cgroup.procs", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_PROCS, .write = cgroup1_procs_write, }, { .name = "cgroup.clone_children", .read_u64 = cgroup_clone_children_read, .write_u64 = cgroup_clone_children_write, }, { .name = "cgroup.sane_behavior", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_sane_behavior_show, }, { .name = "tasks", .seq_start = cgroup_pidlist_start, .seq_next = cgroup_pidlist_next, .seq_stop = cgroup_pidlist_stop, .seq_show = cgroup_pidlist_show, .private = CGROUP_FILE_TASKS, .write = cgroup1_tasks_write, }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, { .name = "release_agent", .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write = cgroup_release_agent_write, .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; /* Display information about each subsystem and each hierarchy */ int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; bool cgrp_v1_visible = false; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); /* * Grab the subsystems state racily. No need to add avenue to * cgroup_mutex contention. */ for_each_subsys(ss, i) { cgrp_v1_visible |= ss->root != &cgrp_dfl_root; if (!proc_show_all && cgroup1_subsys_absent(ss)) continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); } if (cgrp_dfl_visible && !cgrp_v1_visible) pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n"); return 0; } /** * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. * * Build and fill cgroupstats so that taskstats can export it to user * space. * * Return: %0 on success or a negative errno code on failure */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; /* it should be kernfs_node belonging to cgroupfs and is a directory */ if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || kernfs_type(kn) != KERNFS_DIR) return -EINVAL; /* * We aren't being called from kernfs and there's no guarantee on * @kn->priv's validity. For this and css_tryget_online_from_dir(), * @kn->priv is RCU safe. Let's do the RCU dancing. */ rcu_read_lock(); cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv); if (!cgrp || !cgroup_tryget(cgrp)) { rcu_read_unlock(); return -ENOENT; } rcu_read_unlock(); css_task_iter_start(&cgrp->self, 0, &it); while ((tsk = css_task_iter_next(&it))) { switch (READ_ONCE(tsk->__state)) { case TASK_RUNNING: stats->nr_running++; break; case TASK_INTERRUPTIBLE: stats->nr_sleeping++; break; case TASK_UNINTERRUPTIBLE: stats->nr_uninterruptible++; break; case TASK_STOPPED: stats->nr_stopped++; break; default: if (tsk->in_iowait) stats->nr_io_wait++; break; } } css_task_iter_end(&it); cgroup_put(cgrp); return 0; } void cgroup1_check_for_release(struct cgroup *cgrp) { if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) && !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp)) schedule_work(&cgrp->release_agent_work); } /* * Notify userspace when a cgroup is released, by running the * configured release agent with the name of the cgroup (path * relative to the root of cgroup file system) as the argument. * * Most likely, this user command will try to rmdir this cgroup. * * This races with the possibility that some other task will be * attached to this cgroup before it is removed, or that some other * user task will 'mkdir' a child cgroup of this cgroup. That's ok. * The presumed 'rmdir' will fail quietly if this cgroup is no longer * unused, and this cgroup will be reprieved from its death sentence, * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which * means only wait until the task is successfully execve()'d. The * separate release agent task is forked by call_usermodehelper(), * then control in this thread returns here, without waiting for the * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. */ void cgroup1_release_agent(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, release_agent_work); char *pathbuf, *agentbuf; char *argv[3], *envp[3]; int ret; /* snoop agent path and exit early if empty */ if (!cgrp->root->release_agent_path[0]) return; /* prepare argument buffers */ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf || !agentbuf) goto out_free; spin_lock(&release_agent_path_lock); strscpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); spin_unlock(&release_agent_path_lock); if (!agentbuf[0]) goto out_free; ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); if (ret < 0) goto out_free; argv[0] = agentbuf; argv[1] = pathbuf; argv[2] = NULL; /* minimal command environment */ envp[0] = "HOME=/"; envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[2] = NULL; call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); out_free: kfree(agentbuf); kfree(pathbuf); } /* * cgroup_rename - Only allow simple rename of directories in place. */ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name_str) { struct cgroup *cgrp = kn->priv; int ret; /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ if (strchr(new_name_str, '\n')) return -EINVAL; if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (rcu_access_pointer(kn->__parent) != new_parent) return -EIO; /* * We're gonna grab cgroup_mutex which nests outside kernfs * active_ref. kernfs_rename() doesn't require active_ref * protection. Break them before grabbing cgroup_mutex. */ kernfs_break_active_protection(new_parent); kernfs_break_active_protection(kn); cgroup_lock(); ret = kernfs_rename(kn, new_parent, new_name_str); if (!ret) TRACE_CGROUP_PATH(rename, cgrp); cgroup_unlock(); kernfs_unbreak_active_protection(kn); kernfs_unbreak_active_protection(new_parent); return ret; } static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_root) { struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_show_option(seq, ss->legacy_name, NULL); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (root->flags & CGRP_ROOT_CPUSET_V2_MODE) seq_puts(seq, ",cpuset_v2_mode"); if (root->flags & CGRP_ROOT_FAVOR_DYNMODS) seq_puts(seq, ",favordynmods"); spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_show_option(seq, "release_agent", root->release_agent_path); spin_unlock(&release_agent_path_lock); if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_show_option(seq, "name", root->name); return 0; } enum cgroup1_param { Opt_all, Opt_clone_children, Opt_cpuset_v2_mode, Opt_name, Opt_none, Opt_noprefix, Opt_release_agent, Opt_xattr, Opt_favordynmods, Opt_nofavordynmods, }; const struct fs_parameter_spec cgroup1_fs_parameters[] = { fsparam_flag ("all", Opt_all), fsparam_flag ("clone_children", Opt_clone_children), fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), fsparam_string("name", Opt_name), fsparam_flag ("none", Opt_none), fsparam_flag ("noprefix", Opt_noprefix), fsparam_string("release_agent", Opt_release_agent), fsparam_flag ("xattr", Opt_xattr), fsparam_flag ("favordynmods", Opt_favordynmods), fsparam_flag ("nofavordynmods", Opt_nofavordynmods), {} }; int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_subsys *ss; struct fs_parse_result result; int opt, i; opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { int ret; ret = vfs_parse_fs_param_source(fc, param); if (ret != -ENOPARAM) return ret; for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name) || cgroup1_subsys_absent(ss)) continue; if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) return invalfc(fc, "Disabled controller '%s'", param->key); ctx->subsys_mask |= (1 << i); return 0; } return invalfc(fc, "Unknown subsys name '%s'", param->key); } if (opt < 0) return opt; switch (opt) { case Opt_none: /* Explicitly have no subsystems */ ctx->none = true; break; case Opt_all: ctx->all_ss = true; break; case Opt_noprefix: ctx->flags |= CGRP_ROOT_NOPREFIX; break; case Opt_clone_children: ctx->cpuset_clone_children = true; break; case Opt_cpuset_v2_mode: ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; break; case Opt_xattr: ctx->flags |= CGRP_ROOT_XATTR; break; case Opt_favordynmods: ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_nofavordynmods: ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; break; case Opt_release_agent: /* Specifying two release agents is forbidden */ if (ctx->release_agent) return invalfc(fc, "release_agent respecified"); /* * Release agent gets called with all capabilities, * require capabilities to set release agent. */ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) return invalfc(fc, "Setting release_agent not allowed"); ctx->release_agent = param->string; param->string = NULL; break; case Opt_name: /* blocked by boot param? */ if (cgroup_no_v1_named) return -ENOENT; /* Can't specify an empty name */ if (!param->size) return invalfc(fc, "Empty name"); if (param->size > MAX_CGROUP_ROOT_NAMELEN - 1) return invalfc(fc, "Name too long"); /* Must match [\w.-]+ */ for (i = 0; i < param->size; i++) { char c = param->string[i]; if (isalnum(c)) continue; if ((c == '.') || (c == '-') || (c == '_')) continue; return invalfc(fc, "Invalid name"); } /* Specifying two names is forbidden */ if (ctx->name) return invalfc(fc, "name respecified"); ctx->name = param->string; param->string = NULL; break; } return 0; } static int check_cgroupfs_options(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); u16 mask = U16_MAX; u16 enabled = 0; struct cgroup_subsys *ss; int i; #ifdef CONFIG_CPUSETS mask = ~((u16)1 << cpuset_cgrp_id); #endif for_each_subsys(ss, i) if (cgroup_ssid_enabled(i) && !cgroup1_ssid_disabled(i) && !cgroup1_subsys_absent(ss)) enabled |= 1 << i; ctx->subsys_mask &= enabled; /* * In absence of 'none', 'name=' and subsystem name options, * let's default to 'all'. */ if (!ctx->subsys_mask && !ctx->none && !ctx->name) ctx->all_ss = true; if (ctx->all_ss) { /* Mutually exclusive option 'all' + subsystem name */ if (ctx->subsys_mask) return invalfc(fc, "subsys name conflicts with all"); /* 'all' => select all the subsystems */ ctx->subsys_mask = enabled; } /* * We either have to specify by name or by subsystems. (So all * empty hierarchies must have a name). */ if (!ctx->subsys_mask && !ctx->name) return invalfc(fc, "Need name or subsystem set"); /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ if ((ctx->flags & CGRP_ROOT_NOPREFIX) && (ctx->subsys_mask & mask)) return invalfc(fc, "noprefix used incorrectly"); /* Can't specify "none" and some subsystems */ if (ctx->subsys_mask && ctx->none) return invalfc(fc, "none used incorrectly"); return 0; } int cgroup1_reconfigure(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct kernfs_root *kf_root = kernfs_root_from_sb(fc->root->d_sb); struct cgroup_root *root = cgroup_root_from_kf(kf_root); int ret = 0; u16 added_mask, removed_mask; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = check_cgroupfs_options(fc); if (ret) goto out_unlock; if (ctx->subsys_mask != root->subsys_mask || ctx->release_agent) pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n", task_tgid_nr(current), current->comm); added_mask = ctx->subsys_mask & ~root->subsys_mask; removed_mask = root->subsys_mask & ~ctx->subsys_mask; /* Don't allow flags or name to change at remount */ if ((ctx->flags ^ root->flags) || (ctx->name && strcmp(ctx->name, root->name))) { errorfc(fc, "option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"", ctx->flags, ctx->name ?: "", root->flags, root->name); ret = -EINVAL; goto out_unlock; } /* remounting is not allowed for populated hierarchies */ if (!list_empty(&root->cgrp.self.children)) { ret = -EBUSY; goto out_unlock; } ret = rebind_subsystems(root, added_mask); if (ret) goto out_unlock; WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (ctx->release_agent) { spin_lock(&release_agent_path_lock); strscpy(root->release_agent_path, ctx->release_agent); spin_unlock(&release_agent_path_lock); } trace_cgroup_remount(root); out_unlock: cgroup_unlock(); return ret; } struct kernfs_syscall_ops cgroup1_kf_syscall_ops = { .rename = cgroup1_rename, .show_options = cgroup1_show_options, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, .show_path = cgroup_show_path, }; /* * The guts of cgroup1 mount - find or create cgroup_root to use. * Called with cgroup_mutex held; returns 0 on success, -E... on * error and positive - in case when the candidate is busy dying. * On success it stashes a reference to cgroup_root into given * cgroup_fs_context; that reference is *NOT* counting towards the * cgroup_root refcount. */ static int cgroup1_root_to_use(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); struct cgroup_root *root; struct cgroup_subsys *ss; int i, ret; /* First find the desired set of subsystems */ ret = check_cgroupfs_options(fc); if (ret) return ret; /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the * dying subsystems. We just need to ensure that the ones * unmounted previously finish dying and don't care about new ones * starting. Testing ref liveliness is good enough. */ for_each_subsys(ss, i) { if (!(ctx->subsys_mask & (1 << i)) || ss->root == &cgrp_dfl_root) continue; if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) return 1; /* restart */ cgroup_put(&ss->root->cgrp); } for_each_root(root) { bool name_match = false; if (root == &cgrp_dfl_root) continue; /* * If we asked for a name then it must match. Also, if * name matches but sybsys_mask doesn't, we should fail. * Remember whether name matched. */ if (ctx->name) { if (strcmp(ctx->name, root->name)) continue; name_match = true; } /* * If we asked for subsystems (or explicitly for no * subsystems) then they must match. */ if ((ctx->subsys_mask || ctx->none) && (ctx->subsys_mask != root->subsys_mask)) { if (!name_match) continue; return -EBUSY; } if (root->flags ^ ctx->flags) pr_warn("new mount options do not match the existing superblock, will be ignored\n"); ctx->root = root; return 0; } /* * No such thing, create a new one. name= matching without subsys * specification is allowed for already existing hierarchies but we * can't create new one without subsys specification. */ if (!ctx->subsys_mask && !ctx->none) return invalfc(fc, "No subsys list or none specified"); /* Hierarchies may only be created in the initial cgroup namespace. */ if (ctx->ns != &init_cgroup_ns) return -EPERM; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return -ENOMEM; ctx->root = root; init_cgroup_root(ctx); ret = cgroup_setup_root(root, ctx->subsys_mask); if (!ret) cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS); else cgroup_free_root(root); return ret; } int cgroup1_get_tree(struct fs_context *fc) { struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; /* Check if the caller has permission to mount. */ if (!ns_capable(ctx->ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); ret = cgroup1_root_to_use(fc); if (!ret && !percpu_ref_tryget_live(&ctx->root->cgrp.self.refcnt)) ret = 1; /* restart */ cgroup_unlock(); if (!ret) ret = cgroup_do_get_tree(fc); if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) { fc_drop_locked(fc); ret = 1; } if (unlikely(ret > 0)) { msleep(10); return restart_syscall(); } return ret; } /** * task_get_cgroup1 - Acquires the associated cgroup of a task within a * specific cgroup1 hierarchy. The cgroup1 hierarchy is identified by its * hierarchy ID. * @tsk: The target task * @hierarchy_id: The ID of a cgroup1 hierarchy * * On success, the cgroup is returned. On failure, ERR_PTR is returned. * We limit it to cgroup1 only. */ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id) { struct cgroup *cgrp = ERR_PTR(-ENOENT); struct cgroup_root *root; unsigned long flags; rcu_read_lock(); for_each_root(root) { /* cgroup1 only*/ if (root == &cgrp_dfl_root) continue; if (root->hierarchy_id != hierarchy_id) continue; spin_lock_irqsave(&css_set_lock, flags); cgrp = task_cgroup_from_root(tsk, root); if (!cgrp || !cgroup_tryget(cgrp)) cgrp = ERR_PTR(-ENOENT); spin_unlock_irqrestore(&css_set_lock, flags); break; } rcu_read_unlock(); return cgrp; } static int __init cgroup1_wq_init(void) { /* * Used to destroy pidlists and separate to serve as flush domain. * Cap @max_active to 1 too. */ cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", WQ_PERCPU, 1); BUG_ON(!cgroup_pidlist_destroy_wq); return 0; } core_initcall(cgroup1_wq_init); static int __init cgroup_no_v1(char *str) { struct cgroup_subsys *ss; char *token; int i; while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; if (!strcmp(token, "all")) { cgroup_no_v1_mask = U16_MAX; continue; } if (!strcmp(token, "named")) { cgroup_no_v1_named = true; continue; } for_each_subsys(ss, i) { if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; cgroup_no_v1_mask |= 1 << i; break; } } return 1; } __setup("cgroup_no_v1=", cgroup_no_v1); static int __init cgroup_v1_proc(char *str) { return (kstrtobool(str, &proc_show_all) == 0); } __setup("cgroup_v1_proc=", cgroup_v1_proc); |
| 91 178 34 14358 711 3088 562 25 34 36 22 2552 213 52 52 2174 4777 597 2591 2524 1762 1768 209 13109 5 7 462 5 38 6 1198 27 116 358 764 400 220 27 177 400 3 3 351 28 92 144 6 39 2 57 44 4 101 8 7 103 99 319 317 15547 16966 148 222 1224 148 1615 263 16966 23 310 15531 12 15560 117 15148 148 148 148 148 89 15 307 30 15 307 6 1401 333 95 166 13766 167 171 1294 897 895 761 758 60 70 99 8 938 8 6 41 13869 100 80 82 123 1 13923 1260 243 53 1322 241 62 1114 121 969 90 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Interfaces handler. * * Version: @(#)dev.h 1.0.10 08/12/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Donald J. Becker, <becker@cesdis.gsfc.nasa.gov> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Bjorn Ekwall. <bj0rn@blox.se> * Pekka Riikonen <priikone@poseidon.pspt.fi> * * Moved to /usr/include/linux for NET3 */ #ifndef _LINUX_NETDEVICE_H #define _LINUX_NETDEVICE_H #include <linux/timer.h> #include <linux/bug.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/prefetch.h> #include <asm/cache.h> #include <asm/byteorder.h> #include <asm/local.h> #include <linux/percpu.h> #include <linux/rculist.h> #include <linux/workqueue.h> #include <linux/dynamic_queue_limits.h> #include <net/net_namespace.h> #ifdef CONFIG_DCB #include <net/dcbnl.h> #endif #include <net/netprio_cgroup.h> #include <linux/netdev_features.h> #include <linux/neighbour.h> #include <linux/netdevice_xmit.h> #include <uapi/linux/netdevice.h> #include <uapi/linux/if_bonding.h> #include <uapi/linux/pkt_cls.h> #include <uapi/linux/netdev.h> #include <linux/hashtable.h> #include <linux/rbtree.h> #include <net/net_trackers.h> #include <net/net_debug.h> #include <net/dropreason-core.h> #include <net/neighbour_tables.h> struct netpoll_info; struct device; struct ethtool_ops; struct kernel_hwtstamp_config; struct phy_device; struct dsa_port; struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; struct netdev_config; struct netdev_name_node; struct sd_flow_limit; struct sfp_bus; /* 802.11 specific */ struct wireless_dev; /* 802.15.4 specific */ struct wpan_dev; struct mpls_dev; /* UDP Tunnel offloads */ struct udp_tunnel_info; struct udp_tunnel_nic_info; struct udp_tunnel_nic; struct bpf_prog; struct xdp_buff; struct xdp_frame; struct xdp_metadata_ops; struct xdp_md; struct ethtool_netdev_state; struct phy_link_topology; struct hwtstamp_provider; typedef u32 xdp_features_t; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); void netdev_sw_irq_coalesce_default_on(struct net_device *dev); /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ #define MAX_NEST_DEV 8 /* * Transmit return codes: transmit return codes originate from three different * namespaces: * * - qdisc return codes * - driver transmit return codes * - errno values * * Drivers are allowed to return any one of those in their hard_start_xmit() * function. Real network devices commonly used with qdiscs should only return * the driver transmit return codes though - when qdiscs are used, the actual * transmission happens asynchronously, so the value is not propagated to * higher layers. Virtual network devices transmit synchronously; in this case * the driver transmit return codes are consumed by dev_queue_xmit(), and all * others are propagated to higher layers. */ /* qdisc ->enqueue() return codes. */ #define NET_XMIT_SUCCESS 0x00 #define NET_XMIT_DROP 0x01 /* skb dropped */ #define NET_XMIT_CN 0x02 /* congestion notification */ #define NET_XMIT_MASK 0x0f /* qdisc flags in net/sch_generic.h */ /* NET_XMIT_CN is special. It does not guarantee that this packet is lost. It * indicates that the device will soon be dropping packets, or already drops * some packets of the same priority; prompting us to send less aggressively. */ #define net_xmit_eval(e) ((e) == NET_XMIT_CN ? 0 : (e)) #define net_xmit_errno(e) ((e) != NET_XMIT_CN ? -ENOBUFS : 0) /* Driver transmit return codes */ #define NETDEV_TX_MASK 0xf0 enum netdev_tx { __NETDEV_TX_MIN = INT_MIN, /* make sure enum is signed */ NETDEV_TX_OK = 0x00, /* driver took care of packet */ NETDEV_TX_BUSY = 0x10, /* driver tx path was busy*/ }; typedef enum netdev_tx netdev_tx_t; /* * Current order: NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant; * hard_start_xmit() return < NET_XMIT_MASK means skb was consumed. */ static inline bool dev_xmit_complete(int rc) { /* * Positive cases with an skb consumed by a driver: * - successful transmission (rc == NETDEV_TX_OK) * - error while transmitting (rc < 0) * - error while queueing to a different device (rc & NET_XMIT_MASK) */ if (likely(rc < NET_XMIT_MASK)) return true; return false; } /* * Compute the worst-case header length according to the protocols * used. */ #if defined(CONFIG_HYPERV_NET) # define LL_MAX_HEADER 128 #elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25) # if defined(CONFIG_MAC80211_MESH) # define LL_MAX_HEADER 128 # else # define LL_MAX_HEADER 96 # endif #else # define LL_MAX_HEADER 32 #endif #if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \ !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL) #define MAX_HEADER LL_MAX_HEADER #else #define MAX_HEADER (LL_MAX_HEADER + 48) #endif /* * Old network device statistics. Fields are native words * (unsigned long) so they can be read and written atomically. */ #define NET_DEV_STAT(FIELD) \ union { \ unsigned long FIELD; \ atomic_long_t __##FIELD; \ } struct net_device_stats { NET_DEV_STAT(rx_packets); NET_DEV_STAT(tx_packets); NET_DEV_STAT(rx_bytes); NET_DEV_STAT(tx_bytes); NET_DEV_STAT(rx_errors); NET_DEV_STAT(tx_errors); NET_DEV_STAT(rx_dropped); NET_DEV_STAT(tx_dropped); NET_DEV_STAT(multicast); NET_DEV_STAT(collisions); NET_DEV_STAT(rx_length_errors); NET_DEV_STAT(rx_over_errors); NET_DEV_STAT(rx_crc_errors); NET_DEV_STAT(rx_frame_errors); NET_DEV_STAT(rx_fifo_errors); NET_DEV_STAT(rx_missed_errors); NET_DEV_STAT(tx_aborted_errors); NET_DEV_STAT(tx_carrier_errors); NET_DEV_STAT(tx_fifo_errors); NET_DEV_STAT(tx_heartbeat_errors); NET_DEV_STAT(tx_window_errors); NET_DEV_STAT(rx_compressed); NET_DEV_STAT(tx_compressed); }; #undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. */ struct net_device_core_stats { unsigned long rx_dropped; unsigned long tx_dropped; unsigned long rx_nohandler; unsigned long rx_otherhost_dropped; } __aligned(4 * sizeof(unsigned long)); #include <linux/cache.h> #include <linux/skbuff.h> struct neighbour; struct neigh_parms; struct sk_buff; struct netdev_hw_addr { struct list_head list; struct rb_node node; unsigned char addr[MAX_ADDR_LEN]; unsigned char type; #define NETDEV_HW_ADDR_T_LAN 1 #define NETDEV_HW_ADDR_T_SAN 2 #define NETDEV_HW_ADDR_T_UNICAST 3 #define NETDEV_HW_ADDR_T_MULTICAST 4 bool global_use; int sync_cnt; int refcount; int synced; struct rcu_head rcu_head; }; struct netdev_hw_addr_list { struct list_head list; int count; /* Auxiliary tree for faster lookup on addition and deletion */ struct rb_root tree; }; #define netdev_hw_addr_list_count(l) ((l)->count) #define netdev_hw_addr_list_empty(l) (netdev_hw_addr_list_count(l) == 0) #define netdev_hw_addr_list_for_each(ha, l) \ list_for_each_entry(ha, &(l)->list, list) #define netdev_uc_count(dev) netdev_hw_addr_list_count(&(dev)->uc) #define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc) #define netdev_for_each_uc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->uc) #define netdev_for_each_synced_uc_addr(_ha, _dev) \ netdev_for_each_uc_addr((_ha), (_dev)) \ if ((_ha)->sync_cnt) #define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc) #define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc) #define netdev_for_each_mc_addr(ha, dev) \ netdev_hw_addr_list_for_each(ha, &(dev)->mc) #define netdev_for_each_synced_mc_addr(_ha, _dev) \ netdev_for_each_mc_addr((_ha), (_dev)) \ if ((_ha)->sync_cnt) struct hh_cache { unsigned int hh_len; seqlock_t hh_lock; /* cached hardware header; allow for machine alignment needs. */ #define HH_DATA_MOD 16 #define HH_DATA_OFF(__len) \ (HH_DATA_MOD - (((__len - 1) & (HH_DATA_MOD - 1)) + 1)) #define HH_DATA_ALIGN(__len) \ (((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1)) unsigned long hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)]; }; /* Reserve HH_DATA_MOD byte-aligned hard_header_len, but at least that much. * Alternative is: * dev->hard_header_len ? (dev->hard_header_len + * (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0 * * We could use other alignment values, but we must maintain the * relationship HH alignment <= LL alignment. */ #define LL_RESERVED_SPACE(dev) \ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom)) \ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) #define LL_RESERVED_SPACE_EXTRA(dev,extra) \ ((((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \ & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD) struct header_ops { int (*create) (struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len); int (*parse)(const struct sk_buff *skb, unsigned char *haddr); int (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void (*cache_update)(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); bool (*validate)(const char *ll_header, unsigned int len); __be16 (*parse_protocol)(const struct sk_buff *skb); }; /* These flag bits are private to the generic network queueing * layer; they may not be explicitly referenced by any other * code. */ enum netdev_state_t { __LINK_STATE_START, __LINK_STATE_PRESENT, __LINK_STATE_NOCARRIER, __LINK_STATE_LINKWATCH_PENDING, __LINK_STATE_DORMANT, __LINK_STATE_TESTING, }; struct gro_list { struct list_head list; int count; }; /* * size of gro hash buckets, must be <= the number of bits in * gro_node::bitmask */ #define GRO_HASH_BUCKETS 8 /** * struct gro_node - structure to support Generic Receive Offload * @bitmask: bitmask to indicate used buckets in @hash * @hash: hashtable of pending aggregated skbs, separated by flows * @rx_list: list of pending ``GRO_NORMAL`` skbs * @rx_count: cached current length of @rx_list * @cached_napi_id: napi_struct::napi_id cached for hotpath, 0 for standalone */ struct gro_node { unsigned long bitmask; struct gro_list hash[GRO_HASH_BUCKETS]; struct list_head rx_list; u32 rx_count; u32 cached_napi_id; }; /* * Structure for per-NAPI config */ struct napi_config { u64 gro_flush_timeout; u64 irq_suspend_timeout; u32 defer_hard_irqs; cpumask_t affinity_mask; u8 threaded; unsigned int napi_id; }; /* * Structure for NAPI scheduling similar to tasklet but with weighting */ struct napi_struct { /* The poll_list must only be managed by the entity which * changes the state of the NAPI_STATE_SCHED bit. This means * whoever atomically sets that bit can add this napi_struct * to the per-CPU poll_list, and whoever clears that bit * can remove from the list right before clearing the bit. */ struct list_head poll_list; unsigned long state; int weight; u32 defer_hard_irqs_count; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL /* CPU actively polling if netpoll is configured */ int poll_owner; #endif /* CPU on which NAPI has been scheduled for processing */ int list_owner; struct net_device *dev; struct sk_buff *skb; struct gro_node gro; struct hrtimer timer; /* all fields past this point are write-protected by netdev_lock */ struct task_struct *thread; unsigned long gro_flush_timeout; unsigned long irq_suspend_timeout; u32 defer_hard_irqs; /* control-path-only fields follow */ u32 napi_id; struct list_head dev_list; struct hlist_node napi_hash_node; int irq; struct irq_affinity_notify notify; int napi_rmap_idx; int index; struct napi_config *config; }; enum { NAPI_STATE_SCHED, /* Poll is scheduled */ NAPI_STATE_MISSED, /* reschedule a napi */ NAPI_STATE_DISABLE, /* Disable pending */ NAPI_STATE_NPSVC, /* Netpoll - don't dequeue from poll_list */ NAPI_STATE_LISTED, /* NAPI added to system lists */ NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ }; enum { NAPIF_STATE_SCHED = BIT(NAPI_STATE_SCHED), NAPIF_STATE_MISSED = BIT(NAPI_STATE_MISSED), NAPIF_STATE_DISABLE = BIT(NAPI_STATE_DISABLE), NAPIF_STATE_NPSVC = BIT(NAPI_STATE_NPSVC), NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED), NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), }; enum gro_result { GRO_MERGED, GRO_MERGED_FREE, GRO_HELD, GRO_NORMAL, GRO_CONSUMED, }; typedef enum gro_result gro_result_t; /* * enum rx_handler_result - Possible return values for rx_handlers. * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it * further. * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in * case skb->dev was changed by rx_handler. * @RX_HANDLER_EXACT: Force exact delivery, no wildcard. * @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called. * * rx_handlers are functions called from inside __netif_receive_skb(), to do * special processing of the skb, prior to delivery to protocol handlers. * * Currently, a net_device can only have a single rx_handler registered. Trying * to register a second rx_handler will return -EBUSY. * * To register a rx_handler on a net_device, use netdev_rx_handler_register(). * To unregister a rx_handler on a net_device, use * netdev_rx_handler_unregister(). * * Upon return, rx_handler is expected to tell __netif_receive_skb() what to * do with the skb. * * If the rx_handler consumed the skb in some way, it should return * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for * the skb to be delivered in some other way. * * If the rx_handler changed skb->dev, to divert the skb to another * net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the * new device will be called if it exists. * * If the rx_handler decides the skb should be ignored, it should return * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that * are registered on exact device (ptype->dev == skb->dev). * * If the rx_handler didn't change skb->dev, but wants the skb to be normally * delivered, it should return RX_HANDLER_PASS. * * A device without a registered rx_handler will behave as if rx_handler * returned RX_HANDLER_PASS. */ enum rx_handler_result { RX_HANDLER_CONSUMED, RX_HANDLER_ANOTHER, RX_HANDLER_EXACT, RX_HANDLER_PASS, }; typedef enum rx_handler_result rx_handler_result_t; typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb); void __napi_schedule(struct napi_struct *n); void __napi_schedule_irqoff(struct napi_struct *n); static inline bool napi_disable_pending(struct napi_struct *n) { return test_bit(NAPI_STATE_DISABLE, &n->state); } static inline bool napi_prefer_busy_poll(struct napi_struct *n) { return test_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); } /** * napi_is_scheduled - test if NAPI is scheduled * @n: NAPI context * * This check is "best-effort". With no locking implemented, * a NAPI can be scheduled or terminate right after this check * and produce not precise results. * * NAPI_STATE_SCHED is an internal state, napi_is_scheduled * should not be used normally and napi_schedule should be * used instead. * * Use only if the driver really needs to check if a NAPI * is scheduled for example in the context of delayed timer * that can be skipped if a NAPI is already scheduled. * * Return: True if NAPI is scheduled, False otherwise. */ static inline bool napi_is_scheduled(struct napi_struct *n) { return test_bit(NAPI_STATE_SCHED, &n->state); } bool napi_schedule_prep(struct napi_struct *n); /** * napi_schedule - schedule NAPI poll * @n: NAPI context * * Schedule NAPI poll routine to be called if it is not already * running. * Return: true if we schedule a NAPI or false if not. * Refer to napi_schedule_prep() for additional reason on why * a NAPI might not be scheduled. */ static inline bool napi_schedule(struct napi_struct *n) { if (napi_schedule_prep(n)) { __napi_schedule(n); return true; } return false; } /** * napi_schedule_irqoff - schedule NAPI poll * @n: NAPI context * * Variant of napi_schedule(), assuming hard irqs are masked. */ static inline void napi_schedule_irqoff(struct napi_struct *n) { if (napi_schedule_prep(n)) __napi_schedule_irqoff(n); } /** * napi_complete_done - NAPI processing complete * @n: NAPI context * @work_done: number of packets processed * * Mark NAPI processing as complete. Should only be called if poll budget * has not been completely consumed. * Prefer over napi_complete(). * Return: false if device should avoid rearming interrupts. */ bool napi_complete_done(struct napi_struct *n, int work_done); static inline bool napi_complete(struct napi_struct *n) { return napi_complete_done(n, 0); } void netif_threaded_enable(struct net_device *dev); int dev_set_threaded(struct net_device *dev, enum netdev_napi_threaded threaded); void napi_disable(struct napi_struct *n); void napi_disable_locked(struct napi_struct *n); void napi_enable(struct napi_struct *n); void napi_enable_locked(struct napi_struct *n); /** * napi_synchronize - wait until NAPI is not running * @n: NAPI context * * Wait until NAPI is done being scheduled on this context. * Waits till any outstanding processing completes but * does not disable future activations. */ static inline void napi_synchronize(const struct napi_struct *n) { if (IS_ENABLED(CONFIG_SMP)) while (test_bit(NAPI_STATE_SCHED, &n->state)) msleep(1); else barrier(); } /** * napi_if_scheduled_mark_missed - if napi is running, set the * NAPIF_STATE_MISSED * @n: NAPI context * * If napi is running, set the NAPIF_STATE_MISSED, and return true if * NAPI is scheduled. **/ static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n) { unsigned long val, new; val = READ_ONCE(n->state); do { if (val & NAPIF_STATE_DISABLE) return true; if (!(val & NAPIF_STATE_SCHED)) return false; new = val | NAPIF_STATE_MISSED; } while (!try_cmpxchg(&n->state, &val, new)); return true; } enum netdev_queue_state_t { __QUEUE_STATE_DRV_XOFF, __QUEUE_STATE_STACK_XOFF, __QUEUE_STATE_FROZEN, }; #define QUEUE_STATE_DRV_XOFF (1 << __QUEUE_STATE_DRV_XOFF) #define QUEUE_STATE_STACK_XOFF (1 << __QUEUE_STATE_STACK_XOFF) #define QUEUE_STATE_FROZEN (1 << __QUEUE_STATE_FROZEN) #define QUEUE_STATE_ANY_XOFF (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF) #define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \ QUEUE_STATE_FROZEN) #define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \ QUEUE_STATE_FROZEN) /* * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue. The * netif_tx_* functions below are used to manipulate this flag. The * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit * queue independently. The netif_xmit_*stopped functions below are called * to check if the queue has been stopped by the driver or stack (either * of the XOFF bits are set in the state). Drivers should not need to call * netif_xmit*stopped functions, they should only be using netif_tx_*. */ struct netdev_queue { /* * read-mostly part */ struct net_device *dev; netdevice_tracker dev_tracker; struct Qdisc __rcu *qdisc; struct Qdisc __rcu *qdisc_sleeping; #ifdef CONFIG_SYSFS struct kobject kobj; const struct attribute_group **groups; #endif unsigned long tx_maxrate; /* * Number of TX timeouts for this queue * (/sys/class/net/DEV/Q/trans_timeout) */ atomic_long_t trans_timeout; /* Subordinate device that the queue has been assigned to */ struct net_device *sb_dev; #ifdef CONFIG_XDP_SOCKETS /* "ops protected", see comment about net_device::lock */ struct xsk_buff_pool *pool; #endif /* * write-mostly part */ #ifdef CONFIG_BQL struct dql dql; #endif spinlock_t _xmit_lock ____cacheline_aligned_in_smp; int xmit_lock_owner; /* * Time (in jiffies) of last Tx */ unsigned long trans_start; unsigned long state; /* * slow- / control-path part */ /* NAPI instance for the queue * "ops protected", see comment about net_device::lock */ struct napi_struct *napi; #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) int numa_node; #endif } ____cacheline_aligned_in_smp; extern int sysctl_fb_tunnels_only_for_init_net; extern int sysctl_devconf_inherit_init_net; /* * sysctl_fb_tunnels_only_for_init_net == 0 : For all netns * == 1 : For initns only * == 2 : For none. */ static inline bool net_has_fallback_tunnels(const struct net *net) { #if IS_ENABLED(CONFIG_SYSCTL) int fb_tunnels_only_for_init_net = READ_ONCE(sysctl_fb_tunnels_only_for_init_net); return !fb_tunnels_only_for_init_net || (net_eq(net, &init_net) && fb_tunnels_only_for_init_net == 1); #else return true; #endif } static inline int net_inherit_devconf(void) { #if IS_ENABLED(CONFIG_SYSCTL) return READ_ONCE(sysctl_devconf_inherit_init_net); #else return 0; #endif } static inline int netdev_queue_numa_node_read(const struct netdev_queue *q) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) return q->numa_node; #else return NUMA_NO_NODE; #endif } static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node) { #if defined(CONFIG_XPS) && defined(CONFIG_NUMA) q->numa_node = node; #endif } #ifdef CONFIG_RFS_ACCEL bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id, u16 filter_id); #endif /* XPS map type and offset of the xps map within net_device->xps_maps[]. */ enum xps_map_type { XPS_CPUS = 0, XPS_RXQS, XPS_MAPS_MAX, }; #ifdef CONFIG_XPS /* * This structure holds an XPS map which can be of variable length. The * map is an array of queues. */ struct xps_map { unsigned int len; unsigned int alloc_len; struct rcu_head rcu; u16 queues[]; }; #define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16))) #define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \ - sizeof(struct xps_map)) / sizeof(u16)) /* * This structure holds all XPS maps for device. Maps are indexed by CPU. * * We keep track of the number of cpus/rxqs used when the struct is allocated, * in nr_ids. This will help not accessing out-of-bound memory. * * We keep track of the number of traffic classes used when the struct is * allocated, in num_tc. This will be used to navigate the maps, to ensure we're * not crossing its upper bound, as the original dev->num_tc can be updated in * the meantime. */ struct xps_dev_maps { struct rcu_head rcu; unsigned int nr_ids; s16 num_tc; struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */ }; #define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \ (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *))) #define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\ (_rxqs * (_tcs) * sizeof(struct xps_map *))) #endif /* CONFIG_XPS */ #define TC_MAX_QUEUE 16 #define TC_BITMASK 15 /* HW offloaded queuing disciplines txq count and offset maps */ struct netdev_tc_txq { u16 count; u16 offset; }; #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) /* * This structure is to hold information about the device * configured to run FCoE protocol stack. */ struct netdev_fcoe_hbainfo { char manufacturer[64]; char serial_number[64]; char hardware_version[64]; char driver_version[64]; char optionrom_version[64]; char firmware_version[64]; char model[256]; char model_description[256]; }; #endif #define MAX_PHYS_ITEM_ID_LEN 32 /* This structure holds a unique identifier to identify some * physical item (port for example) used by a netdevice. */ struct netdev_phys_item_id { unsigned char id[MAX_PHYS_ITEM_ID_LEN]; unsigned char id_len; }; static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, struct netdev_phys_item_id *b) { return a->id_len == b->id_len && memcmp(a->id, b->id, a->id_len) == 0; } typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); enum net_device_path_type { DEV_PATH_ETHERNET = 0, DEV_PATH_VLAN, DEV_PATH_BRIDGE, DEV_PATH_PPPOE, DEV_PATH_DSA, DEV_PATH_MTK_WDMA, }; struct net_device_path { enum net_device_path_type type; const struct net_device *dev; union { struct { u16 id; __be16 proto; u8 h_dest[ETH_ALEN]; } encap; struct { enum { DEV_PATH_BR_VLAN_KEEP, DEV_PATH_BR_VLAN_TAG, DEV_PATH_BR_VLAN_UNTAG, DEV_PATH_BR_VLAN_UNTAG_HW, } vlan_mode; u16 vlan_id; __be16 vlan_proto; } bridge; struct { int port; u16 proto; } dsa; struct { u8 wdma_idx; u8 queue; u16 wcid; u8 bss; u8 amsdu; } mtk_wdma; }; }; #define NET_DEVICE_PATH_STACK_MAX 5 #define NET_DEVICE_PATH_VLAN_MAX 2 struct net_device_path_stack { int num_paths; struct net_device_path path[NET_DEVICE_PATH_STACK_MAX]; }; struct net_device_path_ctx { const struct net_device *dev; u8 daddr[ETH_ALEN]; int num_vlans; struct { u16 id; __be16 proto; } vlan[NET_DEVICE_PATH_VLAN_MAX]; }; enum tc_setup_type { TC_QUERY_CAPS, TC_SETUP_QDISC_MQPRIO, TC_SETUP_CLSU32, TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, TC_SETUP_BLOCK, TC_SETUP_QDISC_CBS, TC_SETUP_QDISC_RED, TC_SETUP_QDISC_PRIO, TC_SETUP_QDISC_MQ, TC_SETUP_QDISC_ETF, TC_SETUP_ROOT_QDISC, TC_SETUP_QDISC_GRED, TC_SETUP_QDISC_TAPRIO, TC_SETUP_FT, TC_SETUP_QDISC_ETS, TC_SETUP_QDISC_TBF, TC_SETUP_QDISC_FIFO, TC_SETUP_QDISC_HTB, TC_SETUP_ACT, }; /* These structures hold the attributes of bpf state that are being passed * to the netdevice through the bpf op. */ enum bpf_netdev_command { /* Set or clear a bpf program used in the earliest stages of packet * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee * is responsible for calling bpf_prog_put on any old progs that are * stored. In case of error, the callee need not release the new prog * reference, but on success it takes ownership and must bpf_prog_put * when it is no longer used. */ XDP_SETUP_PROG, XDP_SETUP_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, XDP_SETUP_XSK_POOL, }; struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; struct bpf_xdp_link; enum bpf_xdp_mode { XDP_MODE_SKB = 0, XDP_MODE_DRV = 1, XDP_MODE_HW = 2, __MAX_XDP_MODE }; struct bpf_xdp_entity { struct bpf_prog *prog; struct bpf_xdp_link *link; }; struct netdev_bpf { enum bpf_netdev_command command; union { /* XDP_SETUP_PROG */ struct { u32 flags; struct bpf_prog *prog; struct netlink_ext_ack *extack; }; /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ struct { struct bpf_offloaded_map *offmap; }; /* XDP_SETUP_XSK_POOL */ struct { struct xsk_buff_pool *pool; u16 queue_id; } xsk; }; }; /* Flags for ndo_xsk_wakeup. */ #define XDP_WAKEUP_RX (1 << 0) #define XDP_WAKEUP_TX (1 << 1) #ifdef CONFIG_XFRM_OFFLOAD struct xfrmdev_ops { int (*xdo_dev_state_add)(struct net_device *dev, struct xfrm_state *x, struct netlink_ext_ack *extack); void (*xdo_dev_state_delete)(struct net_device *dev, struct xfrm_state *x); void (*xdo_dev_state_free)(struct net_device *dev, struct xfrm_state *x); bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); void (*xdo_dev_state_update_stats) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; #endif struct dev_ifalias { struct rcu_head rcuhead; char ifalias[]; }; struct devlink; struct tlsdev_ops; struct netdev_net_notifier { struct list_head list; struct notifier_block *nb; }; /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are * optional and can be filled with a null pointer. * * int (*ndo_init)(struct net_device *dev); * This function is called once when a network device is registered. * The network device can use this for any late stage initialization * or semantic validation. It can fail with an error code which will * be propagated back to register_netdev. * * void (*ndo_uninit)(struct net_device *dev); * This function is called when device is unregistered or when registration * fails. It is not called if init fails. * * int (*ndo_open)(struct net_device *dev); * This function is called when a network device transitions to the up * state. * * int (*ndo_stop)(struct net_device *dev); * This function is called when a network device transitions to the down * state. * * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, * struct net_device *dev); * Called when a packet needs to be transmitted. * Returns NETDEV_TX_OK. Can return NETDEV_TX_BUSY, but you should stop * the queue before that can happen; it's for obsolete devices and weird * corner cases, but the stack really does a non-trivial amount * of useless work if you return NETDEV_TX_BUSY. * Required; cannot be NULL. * * netdev_features_t (*ndo_features_check)(struct sk_buff *skb, * struct net_device *dev * netdev_features_t features); * Called by core transmit path to determine if device is capable of * performing offload operations on a given packet. This is to give * the device an opportunity to implement any restrictions that cannot * be otherwise expressed by feature flags. The check is called with * the set of features that the stack has calculated and it returns * those the driver believes to be appropriate. * * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, * struct net_device *sb_dev); * Called to decide which queue to use when device supports multiple * transmit queues. * * void (*ndo_change_rx_flags)(struct net_device *dev, int flags); * This function is called to allow device receiver to make * changes to configuration when multicast or promiscuous is enabled. * * void (*ndo_set_rx_mode)(struct net_device *dev); * This function is called device changes address list filtering. * If driver handles unicast address filtering, it should set * IFF_UNICAST_FLT in its priv_flags. * * int (*ndo_set_mac_address)(struct net_device *dev, void *addr); * This function is called when the Media Access Control address * needs to be changed. If this interface is not defined, the * MAC address can not be changed. * * int (*ndo_validate_addr)(struct net_device *dev); * Test if Media Access Control address is valid for the device. * * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Old-style ioctl entry point. This is used internally by the * ieee802154 subsystem but is no longer called by the device * ioctl handler. * * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); * Used by the bonding driver for its device specific ioctls: * SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE, * SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY * * * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); * Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG, * SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP. * * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); * Used to set network devices bus interface parameters. This interface * is retained for legacy reasons; new devices should use the bus * interface (PCI) for low level management. * * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); * Called when a user wants to change the Maximum Transfer Unit * of a device. * * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue); * Callback used when the transmitter has not made any progress * for dev->watchdog ticks. * * void (*ndo_get_stats64)(struct net_device *dev, * struct rtnl_link_stats64 *storage); * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); * Called when a user wants to get the network device usage * statistics. Drivers must do one of the following: * 1. Define @ndo_get_stats64 to fill in a zero-initialised * rtnl_link_stats64 structure passed by the caller. * 2. Define @ndo_get_stats to update a net_device_stats structure * (which should normally be dev->stats) and return a pointer to * it. The structure may be changed asynchronously only if each * field is written atomically. * 3. Update dev->stats asynchronously and atomically, and define * neither operation. * * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id) * Return true if this device supports offload stats of this attr_id. * * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, * void *attr_data) * Get statistics for offload operations by attr_id. Write it into the * attr_data pointer. * * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is registered. * * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid); * If device supports VLAN filtering this function is called when a * VLAN id is unregistered. * * void (*ndo_poll_controller)(struct net_device *dev); * * SR-IOV management functions. * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac); * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, * u8 qos, __be16 proto); * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, * int max_tx_rate); * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_config)(struct net_device *dev, * int vf, struct ifla_vf_info *ivf); * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); * * Enable or disable the VF ability to query its RSS Redirection Table and * Hash Key. This is needed since on some devices VF share this information * with PF and querying it may introduce a theoretical security risk. * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, * void *type_data); * Called to setup any 'tc' scheduler, classifier or action on @dev. * This is always called from the stack with the rtnl lock held and netif * tx queues stopped. This allows the netdevice to perform queue * management safely. * * Fiber Channel over Ethernet (FCoE) offload functions. * int (*ndo_fcoe_enable)(struct net_device *dev); * Called when the FCoE protocol stack wants to start using LLD for FCoE * so the underlying device can perform whatever needed configuration or * initialization to support acceleration of FCoE traffic. * * int (*ndo_fcoe_disable)(struct net_device *dev); * Called when the FCoE protocol stack wants to stop using LLD for FCoE * so the underlying device can perform whatever needed clean-ups to * stop supporting acceleration of FCoE traffic. * * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, * struct scatterlist *sgl, unsigned int sgc); * Called when the FCoE Initiator wants to initialize an I/O that * is a possible candidate for Direct Data Placement (DDP). The LLD can * perform necessary setup and returns 1 to indicate the device is set up * successfully to perform DDP on this I/O, otherwise this returns 0. * * int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); * Called when the FCoE Initiator/Target is done with the DDPed I/O as * indicated by the FC exchange id 'xid', so the underlying device can * clean up and reuse resources for later DDP requests. * * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, * struct scatterlist *sgl, unsigned int sgc); * Called when the FCoE Target wants to initialize an I/O that * is a possible candidate for Direct Data Placement (DDP). The LLD can * perform necessary setup and returns 1 to indicate the device is set up * successfully to perform DDP on this I/O, otherwise this returns 0. * * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev, * struct netdev_fcoe_hbainfo *hbainfo); * Called when the FCoE Protocol stack wants information on the underlying * device. This information is utilized by the FCoE protocol stack to * register attributes with Fiber Channel management service as per the * FC-GS Fabric Device Management Information(FDMI) specification. * * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); * Called when the underlying device wants to override default World Wide * Name (WWN) generation mechanism in FCoE protocol stack to pass its own * World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE * protocol stack to use. * * RFS acceleration. * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, * u16 rxq_index, u32 flow_id); * Set hardware filter for RFS. rxq_index is the target queue index; * flow_id is a flow ID to be passed to rps_may_expire_flow() later. * Return the filter ID on success, or a negative error code. * * Slave management functions (for bridge, bonding, etc). * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to make another netdev an underling. * * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, * struct sk_buff *skb, * bool all_slaves); * Get the xmit slave of master device. If all_slaves is true, function * assume all the slaves can transmit. * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); * Adjusts the requested feature flags according to device-specific * constraints, and returns the resulting flags. Must not modify * the device state. * * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); * Called to update device configuration to new features. Passed * feature set might be less than what was returned by ndo_fix_features()). * Must return >0 or -errno if it changed dev->features itself. * * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid, u16 flags, * bool *notified, struct netlink_ext_ack *extack); * Adds an FDB entry to dev for addr. * Callee shall set *notified to true if it sent any appropriate * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], * struct net_device *dev, * const unsigned char *addr, u16 vid * bool *notified, struct netlink_ext_ack *extack); * Deletes the FDB entry from dev corresponding to addr. * Callee shall set *notified to true if it sent any appropriate * notification(s). Otherwise core will send a generic one. * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, * struct netlink_ext_ack *extack); * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, * struct net_device *dev, struct net_device *filter_dev, * int *idx) * Used to add FDB entries to dump requests. Implementers should add * entries to skb and update idx with the number of entries. * * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[], * u16 nlmsg_flags, struct netlink_ext_ack *extack); * Adds an MDB entry to dev. * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Deletes the MDB entry from dev. * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], * struct netlink_ext_ack *extack); * Bulk deletes MDB entries from dev. * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, * struct netlink_callback *cb); * Dumps MDB entries from dev. The first argument (marker) in the netlink * callback is used by core rtnetlink code. * * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, * u16 flags, struct netlink_ext_ack *extack) * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, * struct net_device *dev, u32 filter_mask, * int nlflags) * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, * u16 flags); * * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); * Called to change device carrier. Soft-devices (like dummy, team, etc) * which do not represent real hardware may define this to allow their * userspace components to manage their virtual carrier state. Devices * that determine carrier state from physical hardware properties (eg * network cables) or protocol-dependent mechanisms (eg * USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function. * * int (*ndo_get_phys_port_id)(struct net_device *dev, * struct netdev_phys_item_id *ppid); * Called to get ID of physical port of this device. If driver does * not implement this, it is assumed that the hw is not able to have * multiple net devices on single physical port. * * int (*ndo_get_port_parent_id)(struct net_device *dev, * struct netdev_phys_item_id *ppid) * Called to get the parent ID of the physical port of this device. * * void* (*ndo_dfwd_add_station)(struct net_device *pdev, * struct net_device *dev) * Called by upper layer devices to accelerate switching or other * station functionality into hardware. 'pdev is the lowerdev * to use for the offload and 'dev' is the net device that will * back the offload. Returns a pointer to the private structure * the upper layer will maintain. * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv) * Called by upper layer device to delete the station created * by 'ndo_dfwd_add_station'. 'pdev' is the net device backing * the station and priv is the structure returned by the add * operation. * int (*ndo_set_tx_maxrate)(struct net_device *dev, * int queue_index, u32 maxrate); * Called when a user wants to set a max-rate limitation of specific * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); * This function is used to get egress tunnel information for given skb. * This is useful for retrieving outer tunnel header parameters while * sampling packet. * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); * This function is used to specify the headroom that the skb must * consider when allocation skb during packet reception. Setting * appropriate rx headroom value allows avoiding skb head copy on * forward. Setting a negative value resets the rx headroom to the * default value. * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); * This function is used to set or query state related to XDP on the * netdevice and manage BPF offload. See definition of * enum bpf_netdev_command for details. * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, * u32 flags); * This function is used to submit @n XDP packets for transmit on a * netdevice. Returns number of frames successfully transmitted, frames * that got dropped are freed/returned via xdp_return_frame(). * Returns negative number, means general error invoking ndo, meaning * no frames were xmit'ed and core-caller will free all frames. * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev, * struct xdp_buff *xdp); * Get the xmit slave of master device based on the xdp_buff. * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); * This function is used to wake up the softirq, ksoftirqd or kthread * responsible for sending and/or receiving packets on a specific * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev); * If a device is paired with a peer device, return the peer instance. * The caller must be under RCU read context. * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); * Get the forwarding path to reach the real device from the HW destination address * ktime_t (*ndo_get_tstamp)(struct net_device *dev, * const struct skb_shared_hwtstamps *hwtstamps, * bool cycles); * Get hardware timestamp based on normal/adjustable time or free running * cycle counter. This function is required if physical clock supports a * free running cycle counter. * * int (*ndo_hwtstamp_get)(struct net_device *dev, * struct kernel_hwtstamp_config *kernel_config); * Get the currently configured hardware timestamping parameters for the * NIC device. * * int (*ndo_hwtstamp_set)(struct net_device *dev, * struct kernel_hwtstamp_config *kernel_config, * struct netlink_ext_ack *extack); * Change the hardware timestamping parameters for NIC device. */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); void (*ndo_uninit)(struct net_device *dev); int (*ndo_open)(struct net_device *dev); int (*ndo_stop)(struct net_device *dev); netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev); netdev_features_t (*ndo_features_check)(struct sk_buff *skb, struct net_device *dev, netdev_features_t features); u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); void (*ndo_change_rx_flags)(struct net_device *dev, int flags); void (*ndo_set_rx_mode)(struct net_device *dev); int (*ndo_set_mac_address)(struct net_device *dev, void *addr); int (*ndo_validate_addr)(struct net_device *dev); int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd); int (*ndo_siocwandev)(struct net_device *dev, struct if_settings *ifs); int (*ndo_siocdevprivate)(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd); int (*ndo_set_config)(struct net_device *dev, struct ifmap *map); int (*ndo_change_mtu)(struct net_device *dev, int new_mtu); int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *); void (*ndo_tx_timeout) (struct net_device *dev, unsigned int txqueue); void (*ndo_get_stats64)(struct net_device *dev, struct rtnl_link_stats64 *storage); bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id); int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev, void *attr_data); struct net_device_stats* (*ndo_get_stats)(struct net_device *dev); int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid); int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid); #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); int (*ndo_netpoll_setup)(struct net_device *dev); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac); int (*ndo_set_vf_vlan)(struct net_device *dev, int queue, u16 vlan, u8 qos, __be16 proto); int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate, int max_tx_rate); int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting); int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting); int (*ndo_get_vf_config)(struct net_device *dev, int vf, struct ifla_vf_info *ivf); int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state); int (*ndo_get_vf_stats)(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats); int (*ndo_set_vf_port)(struct net_device *dev, int vf, struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); int (*ndo_get_vf_guid)(struct net_device *dev, int vf, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid); int (*ndo_set_vf_guid)(struct net_device *dev, int vf, u64 guid, int guid_type); int (*ndo_set_vf_rss_query_en)( struct net_device *dev, int vf, bool setting); int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type, void *type_data); #if IS_ENABLED(CONFIG_FCOE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc); int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid); int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid, struct scatterlist *sgl, unsigned int sgc); int (*ndo_fcoe_get_hbainfo)(struct net_device *dev, struct netdev_fcoe_hbainfo *hbainfo); #endif #if IS_ENABLED(CONFIG_LIBFCOE) #define NETDEV_FCOE_WWNN 0 #define NETDEV_FCOE_WWPN 1 int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type); #endif #ifdef CONFIG_RFS_ACCEL int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, u16 rxq_index, u32 flow_id); #endif int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, struct sk_buff *skb, bool all_slaves); struct net_device* (*ndo_sk_get_lower_dev)(struct net_device *dev, struct sock *sk); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, netdev_features_t features); int (*ndo_neigh_construct)(struct net_device *dev, struct neighbour *n); void (*ndo_neigh_destroy)(struct net_device *dev, struct neighbour *n); int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags, bool *notified, struct netlink_ext_ack *extack); int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, bool *notified, struct netlink_ext_ack *extack); int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev, struct netlink_ext_ack *extack); int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx); int (*ndo_fdb_get)(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u32 portid, u32 seq, struct netlink_ext_ack *extack); int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags, struct netlink_ext_ack *extack); int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack); int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb); int (*ndo_mdb_get)(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq, struct netlink_ext_ack *extack); int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags, struct netlink_ext_ack *extack); int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u32 filter_mask, int nlflags); int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh, u16 flags); int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier); int (*ndo_get_phys_port_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); int (*ndo_get_port_parent_id)(struct net_device *dev, struct netdev_phys_item_id *ppid); int (*ndo_get_phys_port_name)(struct net_device *dev, char *name, size_t len); void* (*ndo_dfwd_add_station)(struct net_device *pdev, struct net_device *dev); void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv); int (*ndo_set_tx_maxrate)(struct net_device *dev, int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom); int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf); int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp, u32 flags); struct net_device * (*ndo_xdp_get_xmit_slave)(struct net_device *dev, struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path); ktime_t (*ndo_get_tstamp)(struct net_device *dev, const struct skb_shared_hwtstamps *hwtstamps, bool cycles); int (*ndo_hwtstamp_get)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config); int (*ndo_hwtstamp_set)(struct net_device *dev, struct kernel_hwtstamp_config *kernel_config, struct netlink_ext_ack *extack); #if IS_ENABLED(CONFIG_NET_SHAPER) /** * @net_shaper_ops: Device shaping offload operations * see include/net/net_shapers.h */ const struct net_shaper_ops *net_shaper_ops; #endif }; /** * enum netdev_priv_flags - &struct net_device priv_flags * * These are the &struct net_device, they are only set internally * by drivers and used in the kernel. These flags are invisible to * userspace; this means that the order of these flags can change * during any kernel release. * * You should add bitfield booleans after either net_device::priv_flags * (hotpath) or ::threaded (slowpath) instead of extending these flags. * * @IFF_802_1Q_VLAN: 802.1Q VLAN device * @IFF_EBRIDGE: Ethernet bridging device * @IFF_BONDING: bonding master or slave * @IFF_ISATAP: ISATAP interface (RFC4214) * @IFF_WAN_HDLC: WAN HDLC device * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to * release skb->dst * @IFF_DONT_BRIDGE: disallow bridging this ether dev * @IFF_DISABLE_NETPOLL: disable netpoll at run-time * @IFF_MACVLAN_PORT: device used as macvlan port * @IFF_BRIDGE_PORT: device used as bridge port * @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port * @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit * @IFF_UNICAST_FLT: Supports unicast filtering * @IFF_TEAM_PORT: device used as team port * @IFF_SUPP_NOFCS: device supports sending custom FCS * @IFF_LIVE_ADDR_CHANGE: device supports hardware address * change when it's running * @IFF_MACVLAN: Macvlan device * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account * underlying stacked devices * @IFF_L3MDEV_MASTER: device is an L3 master device * @IFF_NO_QUEUE: device can run without qdisc attached * @IFF_OPENVSWITCH: device is a Open vSwitch master * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device * @IFF_TEAM: device is a team device * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external * entity (i.e. the master device for bridged veth) * @IFF_MACSEC: device is a MACsec device * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, IFF_EBRIDGE = 1<<1, IFF_BONDING = 1<<2, IFF_ISATAP = 1<<3, IFF_WAN_HDLC = 1<<4, IFF_XMIT_DST_RELEASE = 1<<5, IFF_DONT_BRIDGE = 1<<6, IFF_DISABLE_NETPOLL = 1<<7, IFF_MACVLAN_PORT = 1<<8, IFF_BRIDGE_PORT = 1<<9, IFF_OVS_DATAPATH = 1<<10, IFF_TX_SKB_SHARING = 1<<11, IFF_UNICAST_FLT = 1<<12, IFF_TEAM_PORT = 1<<13, IFF_SUPP_NOFCS = 1<<14, IFF_LIVE_ADDR_CHANGE = 1<<15, IFF_MACVLAN = 1<<16, IFF_XMIT_DST_RELEASE_PERM = 1<<17, IFF_L3MDEV_MASTER = 1<<18, IFF_NO_QUEUE = 1<<19, IFF_OPENVSWITCH = 1<<20, IFF_L3MDEV_SLAVE = 1<<21, IFF_TEAM = 1<<22, IFF_RXFH_CONFIGURED = 1<<23, IFF_PHONY_HEADROOM = 1<<24, IFF_MACSEC = 1<<25, IFF_NO_RX_HANDLER = 1<<26, IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), }; /* Specifies the type of the struct net_device::ml_priv pointer */ enum netdev_ml_priv_type { ML_PRIV_NONE, ML_PRIV_CAN, }; enum netdev_stat_type { NETDEV_PCPU_STAT_NONE, NETDEV_PCPU_STAT_LSTATS, /* struct pcpu_lstats */ NETDEV_PCPU_STAT_TSTATS, /* struct pcpu_sw_netstats */ NETDEV_PCPU_STAT_DSTATS, /* struct pcpu_dstats */ }; enum netdev_reg_state { NETREG_UNINITIALIZED = 0, NETREG_REGISTERED, /* completed register_netdevice */ NETREG_UNREGISTERING, /* called unregister_netdevice */ NETREG_UNREGISTERED, /* completed unregister todo */ NETREG_RELEASED, /* called free_netdev */ NETREG_DUMMY, /* dummy device for NAPI poll */ }; /** * struct net_device - The DEVICE structure. * * Actually, this whole structure is a big mistake. It mixes I/O * data with strictly "high-level" data, and it has to know about * almost every data structure used in the INET module. * * @priv_flags: flags invisible to userspace defined as bits, see * enum netdev_priv_flags for the definitions * @lltx: device supports lockless Tx. Deprecated for real HW * drivers. Mainly used by logical interfaces, such as * bonding and tunnels * @netmem_tx: device support netmem_tx. * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name * of the interface. * * @name_node: Name hashlist node * @ifalias: SNMP alias * @mem_end: Shared memory end * @mem_start: Shared memory start * @base_addr: Device I/O address * @irq: Device IRQ number * * @state: Generic network queuing layer state, see netdev_state_t * @dev_list: The global list of network devices * @napi_list: List entry used for polling NAPI devices * @unreg_list: List entry when we are unregistering the * device; see the function unregister_netdev * @close_list: List entry used when we are closing the device * @ptype_all: Device-specific packet handlers for all protocols * @ptype_specific: Device-specific, protocol-specific packet handlers * * @adj_list: Directly linked devices, like slaves for bonding * @features: Currently active device features * @hw_features: User-changeable features * * @wanted_features: User-requested features * @vlan_features: Mask of features inheritable by VLAN devices * * @hw_enc_features: Mask of features inherited by encapsulating devices * This field indicates what encapsulation * offloads the hardware is capable of doing, * and drivers will need to set them appropriately. * * @mpls_features: Mask of features inheritable by MPLS * @gso_partial_features: value(s) from NETIF_F_GSO\* * * @ifindex: interface index * @group: The group the device belongs to * * @stats: Statistics struct, which was left as a legacy, use * rtnl_link_stats64 instead * * @core_stats: core networking counters, * do not use this in drivers * @carrier_up_count: Number of times the carrier has been up * @carrier_down_count: Number of times the carrier has been down * * @wireless_handlers: List of functions to handle Wireless Extensions, * instead of ioctl, * see <net/iw_handler.h> for details. * * @netdev_ops: Includes several pointers to callbacks, * if one wants to override the ndo_*() functions * @xdp_metadata_ops: Includes pointers to XDP metadata callbacks. * @xsk_tx_metadata_ops: Includes pointers to AF_XDP TX metadata callbacks. * @ethtool_ops: Management operations * @l3mdev_ops: Layer 3 master device operations * @ndisc_ops: Includes callbacks for different IPv6 neighbour * discovery handling. Necessary for e.g. 6LoWPAN. * @xfrmdev_ops: Transformation offload operations * @tlsdev_ops: Transport Layer Security offload operations * @header_ops: Includes callbacks for creating,parsing,caching,etc * of Layer 2 headers. * * @flags: Interface flags (a la BSD) * @xdp_features: XDP capability supported by the device * @gflags: Global flags ( kept as legacy ) * @priv_len: Size of the ->priv flexible array * @priv: Flexible array containing private data * @operstate: RFC2863 operstate * @link_mode: Mapping policy to operstate * @if_port: Selectable AUI, TP, ... * @dma: DMA channel * @mtu: Interface MTU value * @min_mtu: Interface Minimum MTU value * @max_mtu: Interface Maximum MTU value * @type: Interface hardware type * @hard_header_len: Maximum hardware header length. * @min_header_len: Minimum hardware header length * * @needed_headroom: Extra headroom the hardware may need, but not in all * cases can this be guaranteed * @needed_tailroom: Extra tailroom the hardware may need, but not in all * cases can this be guaranteed. Some cases also use * LL_MAX_HEADER instead to allocate the skb * * interface address info: * * @perm_addr: Permanent hw address * @addr_assign_type: Hw address assignment type * @addr_len: Hardware address length * @upper_level: Maximum depth level of upper devices. * @lower_level: Maximum depth level of lower devices. * @threaded: napi threaded state. * @neigh_priv_len: Used in neigh_alloc() * @dev_id: Used to differentiate devices that share * the same link layer address * @dev_port: Used to differentiate devices that share * the same function * @addr_list_lock: XXX: need comments on this one * @name_assign_type: network interface name assignment type * @uc_promisc: Counter that indicates promiscuous mode * has been enabled due to the need to listen to * additional unicast addresses in a device that * does not implement ndo_set_rx_mode() * @uc: unicast mac addresses * @mc: multicast mac addresses * @dev_addrs: list of device hw addresses * @queues_kset: Group of all Kobjects in the Tx and RX queues * @promiscuity: Number of times the NIC is told to work in * promiscuous mode; if it becomes 0 the NIC will * exit promiscuous mode * @allmulti: Counter, enables or disables allmulticast mode * * @vlan_info: VLAN info * @dsa_ptr: dsa specific data * @tipc_ptr: TIPC specific data * @atalk_ptr: AppleTalk link * @ip_ptr: IPv4 specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering * @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network * device struct * @mpls_ptr: mpls_dev struct pointer * @mctp_ptr: MCTP specific data * @psp_dev: PSP crypto device registered for this netdev * * @dev_addr: Hw address (before bcast, * because most packets are unicast) * * @_rx: Array of RX queues * @num_rx_queues: Number of RX queues * allocated at register_netdev() time * @real_num_rx_queues: Number of RX queues currently active in device * @xdp_prog: XDP sockets filter program pointer * * @rx_handler: handler for received packets * @rx_handler_data: XXX: need comments on this one * @tcx_ingress: BPF & clsact qdisc specific data for ingress processing * @ingress_queue: XXX: need comments on this one * @nf_hooks_ingress: netfilter hooks executed for ingress packets * @broadcast: hw bcast address * * @rx_cpu_rmap: CPU reverse-mapping for RX completion interrupts, * indexed by RX queue number. Assigned by driver. * This must only be set if the ndo_rx_flow_steer * operation is defined * @index_hlist: Device index hash chain * * @_tx: Array of TX queues * @num_tx_queues: Number of TX queues allocated at alloc_netdev_mq() time * @real_num_tx_queues: Number of TX queues currently active in device * @qdisc: Root qdisc from userspace point of view * @tx_queue_len: Max frames per queue allowed * @tx_global_lock: XXX: need comments on this one * @xdp_bulkq: XDP device bulk queue * @xps_maps: all CPUs/RXQs maps for XPS device * * @xps_maps: XXX: need comments on this one * @tcx_egress: BPF & clsact qdisc specific data for egress processing * @nf_hooks_egress: netfilter hooks executed for egress packets * @qdisc_hash: qdisc hash table * @watchdog_timeo: Represents the timeout that is used by * the watchdog (see dev_watchdog()) * @watchdog_timer: List of timers * * @proto_down_reason: reason a netdev interface is held down * @pcpu_refcnt: Number of references to this device * @dev_refcnt: Number of references to this device * @refcnt_tracker: Tracker directory for tracked references to this device * @todo_list: Delayed register/unregister * @link_watch_list: XXX: need comments on this one * * @reg_state: Register/unregister state machine * @dismantle: Device is going to be freed * @needs_free_netdev: Should unregister perform free_netdev? * @priv_destructor: Called from unregister * @npinfo: XXX: need comments on this one * @nd_net: Network namespace this network device is inside * protected by @lock * * @ml_priv: Mid-layer private * @ml_priv_type: Mid-layer private type * * @pcpu_stat_type: Type of device statistics which the core should * allocate/free: none, lstats, tstats, dstats. none * means the driver is handling statistics allocation/ * freeing internally. * @lstats: Loopback statistics: packets, bytes * @tstats: Tunnel statistics: RX/TX packets, RX/TX bytes * @dstats: Dummy statistics: RX/TX/drop packets, RX/TX bytes * * @garp_port: GARP * @mrp_port: MRP * * @dm_private: Drop monitor private * * @dev: Class/net/name entry * @sysfs_groups: Space for optional device, statistics and wireless * sysfs groups * * @sysfs_rx_queue_group: Space for optional per-rx queue attributes * @rtnl_link_ops: Rtnl_link_ops * @stat_ops: Optional ops for queue-aware statistics * @queue_mgmt_ops: Optional ops for queue management * * @gso_max_size: Maximum size of generic segmentation offload * @tso_max_size: Device (as in HW) limit on the max TSO request size * @gso_max_segs: Maximum number of segments that can be passed to the * NIC for GSO * @tso_max_segs: Device (as in HW) limit on the max TSO segment count * @gso_ipv4_max_size: Maximum size of generic segmentation offload, * for IPv4. * * @dcbnl_ops: Data Center Bridging netlink ops * @num_tc: Number of traffic classes in the net device * @tc_to_txq: XXX: need comments on this one * @prio_tc_map: XXX: need comments on this one * * @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp * * @priomap: XXX: need comments on this one * @link_topo: Physical link topology tracking attached PHYs * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. * * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the * switch port. * * @irq_affinity_auto: driver wants the core to store and re-assign the IRQ * affinity. Set by netif_enable_irq_affinity(), then * the driver must create a persistent napi by * netif_napi_add_config() and finally bind the napi to * IRQ (via netif_napi_set_irq()). * * @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap. * Set by calling netif_enable_cpu_rmap(). * * @see_all_hwtstamp_requests: device wants to see calls to * ndo_hwtstamp_set() for all timestamp requests * regardless of source, even if those aren't * HWTSTAMP_SOURCE_NETDEV * @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN * @netns_immutable: interface can't change network namespaces * @fcoe_mtu: device supports maximum FCoE MTU, 2158 bytes * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved * to another network namespace. * * @macsec_ops: MACsec offloading ops * * @udp_tunnel_nic_info: static structure describing the UDP tunnel * offload capabilities of the device * @udp_tunnel_nic: UDP tunnel offload state * @ethtool: ethtool related state * @xdp_state: stores info on attached XDP BPF programs * * @nested_level: Used as a parameter of spin_lock_nested() of * dev->addr_list_lock. * @unlink_list: As netif_addr_lock() can be called recursively, * keep a list of interfaces to be deleted. * @gro_max_size: Maximum size of aggregated packet in generic * receive offload (GRO) * @gro_ipv4_max_size: Maximum size of aggregated packet in generic * receive offload (GRO), for IPv4. * @xdp_zc_max_segs: Maximum number of segments supported by AF_XDP * zero copy driver * * @dev_addr_shadow: Copy of @dev_addr to catch direct writes. * @linkwatch_dev_tracker: refcount tracker used by linkwatch. * @watchdog_dev_tracker: refcount tracker used by watchdog. * @dev_registered_tracker: tracker for reference held while * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * * @devlink_port: Pointer to related devlink port structure. * Assigned by a driver before netdev registration using * SET_NETDEV_DEVLINK_PORT macro. This pointer is static * during the time netdevice is registered. * * @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem, * where the clock is recovered. * * @max_pacing_offload_horizon: max EDT offload horizon in nsec. * @napi_config: An array of napi_config structures containing per-NAPI * settings. * @num_napi_configs: number of allocated NAPI config structs, * always >= max(num_rx_queues, num_tx_queues). * @gro_flush_timeout: timeout for GRO layer in NAPI * @napi_defer_hard_irqs: If not zero, provides a counter that would * allow to avoid NIC hard IRQ, on busy queues. * * @neighbours: List heads pointing to this device's neighbours' * dev_list, one per address-family. * @hwprov: Tracks which PTP performs hardware packet time stamping. * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ struct net_device { /* Cacheline organization can be found documented in * Documentation/networking/net_cachelines/net_device.rst. * Please update the document when adding new fields. */ /* TX read-mostly hotpath */ __cacheline_group_begin(net_device_read_tx); struct_group(priv_flags_fast, unsigned long priv_flags:32; unsigned long lltx:1; unsigned long netmem_tx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; struct netdev_queue *_tx; netdev_features_t gso_partial_features; unsigned int real_num_tx_queues; unsigned int gso_max_size; unsigned int gso_ipv4_max_size; u16 gso_max_segs; s16 num_tc; /* Note : dev->mtu is often read without holding a lock. * Writers usually hold RTNL. * It is recommended to use READ_ONCE() to annotate the reads, * and to use WRITE_ONCE() to annotate the writes. */ unsigned int mtu; unsigned short needed_headroom; struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX]; #endif #ifdef CONFIG_NETFILTER_EGRESS struct nf_hook_entries __rcu *nf_hooks_egress; #endif #ifdef CONFIG_NET_XGRESS struct bpf_mprog_entry __rcu *tcx_egress; #endif __cacheline_group_end(net_device_read_tx); /* TXRX read-mostly hotpath */ __cacheline_group_begin(net_device_read_txrx); union { struct pcpu_lstats __percpu *lstats; struct pcpu_sw_netstats __percpu *tstats; struct pcpu_dstats __percpu *dstats; }; unsigned long state; unsigned int flags; unsigned short hard_header_len; netdev_features_t features; struct inet6_dev __rcu *ip6_ptr; __cacheline_group_end(net_device_read_txrx); /* RX read-mostly hotpath */ __cacheline_group_begin(net_device_read_rx); struct bpf_prog __rcu *xdp_prog; struct list_head ptype_specific; int ifindex; unsigned int real_num_rx_queues; struct netdev_rx_queue *_rx; unsigned int gro_max_size; unsigned int gro_ipv4_max_size; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; possible_net_t nd_net; #ifdef CONFIG_NETPOLL struct netpoll_info __rcu *npinfo; #endif #ifdef CONFIG_NET_XGRESS struct bpf_mprog_entry __rcu *tcx_ingress; #endif __cacheline_group_end(net_device_read_rx); char name[IFNAMSIZ]; struct netdev_name_node *name_node; struct dev_ifalias __rcu *ifalias; /* * I/O specific fields * FIXME: Merge these and struct ifmap into one */ unsigned long mem_end; unsigned long mem_start; unsigned long base_addr; /* * Some hardware also needs these fields (state,dev_list, * napi_list,unreg_list,close_list) but they are not * part of the usual set specified in Space.c. */ struct list_head dev_list; struct list_head napi_list; struct list_head unreg_list; struct list_head close_list; struct list_head ptype_all; struct { struct list_head upper; struct list_head lower; } adj_list; /* Read-mostly cache-line for fast-path access */ xdp_features_t xdp_features; const struct xdp_metadata_ops *xdp_metadata_ops; const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops; unsigned short gflags; unsigned short needed_tailroom; netdev_features_t hw_features; netdev_features_t wanted_features; netdev_features_t vlan_features; netdev_features_t hw_enc_features; netdev_features_t mpls_features; unsigned int min_mtu; unsigned int max_mtu; unsigned short type; unsigned char min_header_len; unsigned char name_assign_type; int group; struct net_device_stats stats; /* not used by modern drivers */ struct net_device_core_stats __percpu *core_stats; /* Stats to monitor link on/off, flapping */ atomic_t carrier_up_count; atomic_t carrier_down_count; #ifdef CONFIG_WIRELESS_EXT const struct iw_handler_def *wireless_handlers; #endif const struct ethtool_ops *ethtool_ops; #ifdef CONFIG_NET_L3_MASTER_DEV const struct l3mdev_ops *l3mdev_ops; #endif #if IS_ENABLED(CONFIG_IPV6) const struct ndisc_ops *ndisc_ops; #endif #ifdef CONFIG_XFRM_OFFLOAD const struct xfrmdev_ops *xfrmdev_ops; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) const struct tlsdev_ops *tlsdev_ops; #endif unsigned int operstate; unsigned char link_mode; unsigned char if_port; unsigned char dma; /* Interface address info. */ unsigned char perm_addr[MAX_ADDR_LEN]; unsigned char addr_assign_type; unsigned char addr_len; unsigned char upper_level; unsigned char lower_level; u8 threaded; unsigned short neigh_priv_len; unsigned short dev_id; unsigned short dev_port; int irq; u32 priv_len; spinlock_t addr_list_lock; struct netdev_hw_addr_list uc; struct netdev_hw_addr_list mc; struct netdev_hw_addr_list dev_addrs; #ifdef CONFIG_SYSFS struct kset *queues_kset; #endif #ifdef CONFIG_LOCKDEP struct list_head unlink_list; #endif unsigned int promiscuity; unsigned int allmulti; bool uc_promisc; #ifdef CONFIG_LOCKDEP unsigned char nested_level; #endif /* Protocol-specific pointers */ struct in_device __rcu *ip_ptr; /** @fib_nh_head: nexthops associated with this netdev */ struct hlist_head fib_nh_head; #if IS_ENABLED(CONFIG_VLAN_8021Q) struct vlan_info __rcu *vlan_info; #endif #if IS_ENABLED(CONFIG_NET_DSA) struct dsa_port *dsa_ptr; #endif #if IS_ENABLED(CONFIG_TIPC) struct tipc_bearer __rcu *tipc_ptr; #endif #if IS_ENABLED(CONFIG_ATALK) void *atalk_ptr; #endif #if IS_ENABLED(CONFIG_AX25) struct ax25_dev __rcu *ax25_ptr; #endif #if IS_ENABLED(CONFIG_CFG80211) struct wireless_dev *ieee80211_ptr; #endif #if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN) struct wpan_dev *ieee802154_ptr; #endif #if IS_ENABLED(CONFIG_MPLS_ROUTING) struct mpls_dev __rcu *mpls_ptr; #endif #if IS_ENABLED(CONFIG_MCTP) struct mctp_dev __rcu *mctp_ptr; #endif #if IS_ENABLED(CONFIG_INET_PSP) struct psp_dev __rcu *psp_dev; #endif /* * Cache lines mostly used on receive path (including eth_type_trans()) */ /* Interface address info used in eth_type_trans() */ const unsigned char *dev_addr; unsigned int num_rx_queues; #define GRO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. */ #define GRO_MAX_SIZE (8 * 65535u) unsigned int xdp_zc_max_segs; struct netdev_queue __rcu *ingress_queue; #ifdef CONFIG_NETFILTER_INGRESS struct nf_hook_entries __rcu *nf_hooks_ingress; #endif unsigned char broadcast[MAX_ADDR_LEN]; #ifdef CONFIG_RFS_ACCEL struct cpu_rmap *rx_cpu_rmap; #endif struct hlist_node index_hlist; /* * Cache lines mostly used on transmit path */ unsigned int num_tx_queues; struct Qdisc __rcu *qdisc; unsigned int tx_queue_len; spinlock_t tx_global_lock; struct xdp_dev_bulk_queue __percpu *xdp_bulkq; #ifdef CONFIG_NET_SCHED DECLARE_HASHTABLE (qdisc_hash, 4); #endif /* These may be needed for future network-power-down code. */ struct timer_list watchdog_timer; int watchdog_timeo; u32 proto_down_reason; struct list_head todo_list; #ifdef CONFIG_PCPU_DEV_REFCNT int __percpu *pcpu_refcnt; #else refcount_t dev_refcnt; #endif struct ref_tracker_dir refcnt_tracker; struct list_head link_watch_list; u8 reg_state; bool dismantle; /** @moving_ns: device is changing netns, protected by @lock */ bool moving_ns; /** @rtnl_link_initializing: Device being created, suppress events */ bool rtnl_link_initializing; bool needs_free_netdev; void (*priv_destructor)(struct net_device *dev); /* mid-layer private */ void *ml_priv; enum netdev_ml_priv_type ml_priv_type; enum netdev_stat_type pcpu_stat_type:8; #if IS_ENABLED(CONFIG_GARP) struct garp_port __rcu *garp_port; #endif #if IS_ENABLED(CONFIG_MRP) struct mrp_port __rcu *mrp_port; #endif #if IS_ENABLED(CONFIG_NET_DROP_MONITOR) struct dm_hw_stat_delta __rcu *dm_private; #endif struct device dev; const struct attribute_group *sysfs_groups[5]; const struct attribute_group *sysfs_rx_queue_group; const struct rtnl_link_ops *rtnl_link_ops; const struct netdev_stat_ops *stat_ops; const struct netdev_queue_mgmt_ops *queue_mgmt_ops; /* for setting kernel sock attribute on TCP connection setup */ #define GSO_MAX_SEGS 65535u #define GSO_LEGACY_MAX_SIZE 65536u /* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE), * and shinfo->gso_segs is a 16bit field. */ #define GSO_MAX_SIZE (8 * GSO_MAX_SEGS) #define TSO_LEGACY_MAX_SIZE 65536 #define TSO_MAX_SIZE UINT_MAX unsigned int tso_max_size; #define TSO_MAX_SEGS U16_MAX u16 tso_max_segs; #ifdef CONFIG_DCB const struct dcbnl_rtnl_ops *dcbnl_ops; #endif u8 prio_tc_map[TC_BITMASK + 1]; #if IS_ENABLED(CONFIG_FCOE) unsigned int fcoe_ddp_xid; #endif #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) struct netprio_map __rcu *priomap; #endif struct phy_link_topology *link_topo; struct phy_device *phydev; struct sfp_bus *sfp_bus; struct lock_class_key *qdisc_tx_busylock; bool proto_down; bool irq_affinity_auto; bool rx_cpu_rmap_auto; /* priv_flags_slow, ungrouped to save space */ unsigned long see_all_hwtstamp_requests:1; unsigned long change_proto_down:1; unsigned long netns_immutable:1; unsigned long fcoe_mtu:1; struct list_head net_notifier_list; #if IS_ENABLED(CONFIG_MACSEC) /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif const struct udp_tunnel_nic_info *udp_tunnel_nic_info; struct udp_tunnel_nic *udp_tunnel_nic; /** @cfg: net_device queue-related configuration */ struct netdev_config *cfg; /** * @cfg_pending: same as @cfg but when device is being actively * reconfigured includes any changes to the configuration * requested by the user, but which may or may not be rejected. */ struct netdev_config *cfg_pending; struct ethtool_netdev_state *ethtool; /* protected by rtnl_lock */ struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; u8 dev_addr_shadow[MAX_ADDR_LEN]; netdevice_tracker linkwatch_dev_tracker; netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; struct rtnl_hw_stats64 *offload_xstats_l3; struct devlink_port *devlink_port; #if IS_ENABLED(CONFIG_DPLL) struct dpll_pin __rcu *dpll_pin; #endif #if IS_ENABLED(CONFIG_PAGE_POOL) /** @page_pools: page pools created for this netdevice */ struct hlist_head page_pools; #endif /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */ struct dim_irq_moder *irq_moder; u64 max_pacing_offload_horizon; struct napi_config *napi_config; u32 num_napi_configs; u32 napi_defer_hard_irqs; unsigned long gro_flush_timeout; /** * @up: copy of @state's IFF_UP, but safe to read with just @lock. * May report false negatives while the device is being opened * or closed (@lock does not protect .ndo_open, or .ndo_close). */ bool up; /** * @request_ops_lock: request the core to run all @netdev_ops and * @ethtool_ops under the @lock. */ bool request_ops_lock; /** * @lock: netdev-scope lock, protects a small selection of fields. * Should always be taken using netdev_lock() / netdev_unlock() helpers. * Drivers are free to use it for other protection. * * For the drivers that implement shaper or queue API, the scope * of this lock is expanded to cover most ndo/queue/ethtool/sysfs * operations. Drivers may opt-in to this behavior by setting * @request_ops_lock. * * @lock protection mixes with rtnl_lock in multiple ways, fields are * either: * * - simply protected by the instance @lock; * * - double protected - writers hold both locks, readers hold either; * * - ops protected - protected by the lock held around the NDOs * and other callbacks, that is the instance lock on devices for * which netdev_need_ops_lock() returns true, otherwise by rtnl_lock; * * - double ops protected - always protected by rtnl_lock but for * devices for which netdev_need_ops_lock() returns true - also * the instance lock. * * Simply protects: * @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list, * @net_shaper_hierarchy, @reg_state, @threaded * * Double protects: * @up, @moving_ns, @nd_net, @xdp_features * * Double ops protects: * @real_num_rx_queues, @real_num_tx_queues * * Also protects some fields in: * struct napi_struct, struct netdev_queue, struct netdev_rx_queue * * Ordering: take after rtnl_lock. */ struct mutex lock; #if IS_ENABLED(CONFIG_NET_SHAPER) /** * @net_shaper_hierarchy: data tracking the current shaper status * see include/net/net_shapers.h */ struct net_shaper_hierarchy *net_shaper_hierarchy; #endif struct hlist_head neighbours[NEIGH_NR_TABLES]; struct hwtstamp_provider __rcu *hwprov; u8 priv[] ____cacheline_aligned __counted_by(priv_len); } ____cacheline_aligned; #define to_net_dev(d) container_of(d, struct net_device, dev) /* * Driver should use this to assign devlink port instance to a netdevice * before it registers the netdevice. Therefore devlink_port is static * during the netdev lifetime after it is registered. */ #define SET_NETDEV_DEVLINK_PORT(dev, port) \ ({ \ WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED); \ ((dev)->devlink_port = (port)); \ }) static inline bool netif_elide_gro(const struct net_device *dev) { if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) return true; return false; } #define NETDEV_ALIGN 32 static inline int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio) { return dev->prio_tc_map[prio & TC_BITMASK]; } static inline int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc) { if (tc >= dev->num_tc) return -EINVAL; dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK; return 0; } int netdev_txq_to_tc(struct net_device *dev, unsigned int txq); void netdev_reset_tc(struct net_device *dev); int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset); int netdev_set_num_tc(struct net_device *dev, u8 num_tc); static inline int netdev_get_num_tc(struct net_device *dev) { return dev->num_tc; } static inline void net_prefetch(void *p) { prefetch(p); #if L1_CACHE_BYTES < 128 prefetch((u8 *)p + L1_CACHE_BYTES); #endif } static inline void net_prefetchw(void *p) { prefetchw(p); #if L1_CACHE_BYTES < 128 prefetchw((u8 *)p + L1_CACHE_BYTES); #endif } void netdev_unbind_sb_channel(struct net_device *dev, struct net_device *sb_dev); int netdev_bind_sb_channel_queue(struct net_device *dev, struct net_device *sb_dev, u8 tc, u16 count, u16 offset); int netdev_set_sb_channel(struct net_device *dev, u16 channel); static inline int netdev_get_sb_channel(struct net_device *dev) { return max_t(int, -dev->num_tc, 0); } static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) { DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues); return &dev->_tx[index]; } static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev, const struct sk_buff *skb) { return netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); } static inline void netdev_for_each_tx_queue(struct net_device *dev, void (*f)(struct net_device *, struct netdev_queue *, void *), void *arg) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) f(dev, &dev->_tx[i], arg); } u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); /* returns the headroom that the master device needs to take in account * when forwarding to this dev */ static inline unsigned netdev_get_fwd_headroom(struct net_device *dev) { return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom; } static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr) { if (dev->netdev_ops->ndo_set_rx_headroom) dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr); } /* set the device rx headroom to the dev's default */ static inline void netdev_reset_rx_headroom(struct net_device *dev) { netdev_set_rx_headroom(dev, -1); } static inline void *netdev_get_ml_priv(struct net_device *dev, enum netdev_ml_priv_type type) { if (dev->ml_priv_type != type) return NULL; return dev->ml_priv; } static inline void netdev_set_ml_priv(struct net_device *dev, void *ml_priv, enum netdev_ml_priv_type type) { WARN(dev->ml_priv_type && dev->ml_priv_type != type, "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n", dev->ml_priv_type, type); WARN(!dev->ml_priv_type && dev->ml_priv, "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n"); dev->ml_priv = ml_priv; dev->ml_priv_type = type; } /* * Net namespace inlines */ static inline struct net *dev_net(const struct net_device *dev) { return read_pnet(&dev->nd_net); } static inline struct net *dev_net_rcu(const struct net_device *dev) { return read_pnet_rcu(&dev->nd_net); } static inline void dev_net_set(struct net_device *dev, struct net *net) { write_pnet(&dev->nd_net, net); } /** * netdev_priv - access network device private data * @dev: network device * * Get network device private data */ static inline void *netdev_priv(const struct net_device *dev) { return (void *)dev->priv; } /* Set the sysfs physical device reference for the network logical device * if set prior to registration will cause a symlink during initialization. */ #define SET_NETDEV_DEV(net, pdev) ((net)->dev.parent = (pdev)) /* Set the sysfs device type for the network logical device to allow * fine-grained identification of different network device types. For * example Ethernet, Wireless LAN, Bluetooth, WiMAX etc. */ #define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype)) void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, enum netdev_queue_type type, struct napi_struct *napi); static inline void netdev_lock(struct net_device *dev) { mutex_lock(&dev->lock); } static inline void netdev_unlock(struct net_device *dev) { mutex_unlock(&dev->lock); } /* Additional netdev_lock()-related helpers are in net/netdev_lock.h */ void netif_napi_set_irq_locked(struct napi_struct *napi, int irq); static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) { netdev_lock(napi->dev); netif_napi_set_irq_locked(napi, irq); netdev_unlock(napi->dev); } /* Default NAPI poll() weight * Device drivers are strongly advised to not use bigger value */ #define NAPI_POLL_WEIGHT 64 void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight); static inline void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { netdev_lock(dev); netif_napi_add_weight_locked(dev, napi, poll, weight); netdev_unlock(dev); } /** * netif_napi_add() - initialize a NAPI context * @dev: network device * @napi: NAPI context * @poll: polling function * * netif_napi_add() must be used to initialize a NAPI context prior to calling * *any* of the other NAPI-related functions. */ static inline void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } static inline void netif_napi_add_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); } static inline void netif_napi_add_tx_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state); netif_napi_add_weight(dev, napi, poll, weight); } static inline void netif_napi_add_config_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int index) { napi->index = index; napi->config = &dev->napi_config[index]; netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT); } /** * netif_napi_add_config - initialize a NAPI context with persistent config * @dev: network device * @napi: NAPI context * @poll: polling function * @index: the NAPI index */ static inline void netif_napi_add_config(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int index) { netdev_lock(dev); netif_napi_add_config_locked(dev, napi, poll, index); netdev_unlock(dev); } /** * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only * @dev: network device * @napi: NAPI context * @poll: polling function * * This variant of netif_napi_add() should be used from drivers using NAPI * to exclusively poll a TX queue. * This will avoid we add it into napi_hash[], thus polluting this hash table. */ static inline void netif_napi_add_tx(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int)) { netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT); } void __netif_napi_del_locked(struct napi_struct *napi); /** * __netif_napi_del - remove a NAPI context * @napi: NAPI context * * Warning: caller must observe RCU grace period before freeing memory * containing @napi. Drivers might want to call this helper to combine * all the needed RCU grace periods into a single one. */ static inline void __netif_napi_del(struct napi_struct *napi) { netdev_lock(napi->dev); __netif_napi_del_locked(napi); netdev_unlock(napi->dev); } static inline void netif_napi_del_locked(struct napi_struct *napi) { __netif_napi_del_locked(napi); synchronize_net(); } /** * netif_napi_del - remove a NAPI context * @napi: NAPI context * * netif_napi_del() removes a NAPI context from the network device NAPI list */ static inline void netif_napi_del(struct napi_struct *napi) { __netif_napi_del(napi); synchronize_net(); } int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs); void netif_set_affinity_auto(struct net_device *dev); struct packet_type { __be16 type; /* This is really htons(ether_type). */ bool ignore_outgoing; struct net_device *dev; /* NULL is wildcarded here */ netdevice_tracker dev_tracker; int (*func) (struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void (*list_func) (struct list_head *, struct packet_type *, struct net_device *); bool (*id_match)(struct packet_type *ptype, struct sock *sk); struct net *af_packet_net; void *af_packet_priv; struct list_head list; }; struct offload_callbacks { struct sk_buff *(*gso_segment)(struct sk_buff *skb, netdev_features_t features); struct sk_buff *(*gro_receive)(struct list_head *head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb, int nhoff); }; struct packet_offload { __be16 type; /* This is really htons(ether_type). */ u16 priority; struct offload_callbacks callbacks; struct list_head list; }; /* often modified stats are per-CPU, other are shared (netdev->stats) */ struct pcpu_sw_netstats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t tx_packets; u64_stats_t tx_bytes; struct u64_stats_sync syncp; } __aligned(4 * sizeof(u64)); struct pcpu_dstats { u64_stats_t rx_packets; u64_stats_t rx_bytes; u64_stats_t tx_packets; u64_stats_t tx_bytes; u64_stats_t rx_drops; u64_stats_t tx_drops; struct u64_stats_sync syncp; } __aligned(8 * sizeof(u64)); struct pcpu_lstats { u64_stats_t packets; u64_stats_t bytes; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes); static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->rx_bytes, len); u64_stats_inc(&tstats->rx_packets); u64_stats_update_end(&tstats->syncp); } static inline void dev_sw_netstats_tx_add(struct net_device *dev, unsigned int packets, unsigned int len) { struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&tstats->syncp); u64_stats_add(&tstats->tx_bytes, len); u64_stats_add(&tstats->tx_packets, packets); u64_stats_update_end(&tstats->syncp); } static inline void dev_lstats_add(struct net_device *dev, unsigned int len) { struct pcpu_lstats *lstats = this_cpu_ptr(dev->lstats); u64_stats_update_begin(&lstats->syncp); u64_stats_add(&lstats->bytes, len); u64_stats_inc(&lstats->packets); u64_stats_update_end(&lstats->syncp); } static inline void dev_dstats_rx_add(struct net_device *dev, unsigned int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->rx_packets); u64_stats_add(&dstats->rx_bytes, len); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_rx_dropped(struct net_device *dev) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->rx_drops); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_rx_dropped_add(struct net_device *dev, unsigned int packets) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_add(&dstats->rx_drops, packets); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_tx_add(struct net_device *dev, unsigned int len) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->tx_packets); u64_stats_add(&dstats->tx_bytes, len); u64_stats_update_end(&dstats->syncp); } static inline void dev_dstats_tx_dropped(struct net_device *dev) { struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats); u64_stats_update_begin(&dstats->syncp); u64_stats_inc(&dstats->tx_drops); u64_stats_update_end(&dstats->syncp); } #define __netdev_alloc_pcpu_stats(type, gfp) \ ({ \ typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\ if (pcpu_stats) { \ int __cpu; \ for_each_possible_cpu(__cpu) { \ typeof(type) *stat; \ stat = per_cpu_ptr(pcpu_stats, __cpu); \ u64_stats_init(&stat->syncp); \ } \ } \ pcpu_stats; \ }) #define netdev_alloc_pcpu_stats(type) \ __netdev_alloc_pcpu_stats(type, GFP_KERNEL) #define devm_netdev_alloc_pcpu_stats(dev, type) \ ({ \ typeof(type) __percpu *pcpu_stats = devm_alloc_percpu(dev, type);\ if (pcpu_stats) { \ int __cpu; \ for_each_possible_cpu(__cpu) { \ typeof(type) *stat; \ stat = per_cpu_ptr(pcpu_stats, __cpu); \ u64_stats_init(&stat->syncp); \ } \ } \ pcpu_stats; \ }) enum netdev_lag_tx_type { NETDEV_LAG_TX_TYPE_UNKNOWN, NETDEV_LAG_TX_TYPE_RANDOM, NETDEV_LAG_TX_TYPE_BROADCAST, NETDEV_LAG_TX_TYPE_ROUNDROBIN, NETDEV_LAG_TX_TYPE_ACTIVEBACKUP, NETDEV_LAG_TX_TYPE_HASH, }; enum netdev_lag_hash { NETDEV_LAG_HASH_NONE, NETDEV_LAG_HASH_L2, NETDEV_LAG_HASH_L34, NETDEV_LAG_HASH_L23, NETDEV_LAG_HASH_E23, NETDEV_LAG_HASH_E34, NETDEV_LAG_HASH_VLAN_SRCMAC, NETDEV_LAG_HASH_UNKNOWN, }; struct netdev_lag_upper_info { enum netdev_lag_tx_type tx_type; enum netdev_lag_hash hash_type; }; struct netdev_lag_lower_state_info { u8 link_up : 1, tx_enabled : 1; }; #include <linux/notifier.h> /* netdevice notifier chain. Please remember to update netdev_cmd_to_name() * and the rtnetlink notification exclusion list in rtnetlink_event() when * adding new types. */ enum netdev_cmd { NETDEV_UP = 1, /* For now you can't veto a device up/down */ NETDEV_DOWN, NETDEV_REBOOT, /* Tell a protocol stack a network interface detected a hardware crash and restarted - we can use this eg to kick tcp sessions once done */ NETDEV_CHANGE, /* Notify device state change */ NETDEV_REGISTER, NETDEV_UNREGISTER, NETDEV_CHANGEMTU, /* notify after mtu change happened */ NETDEV_CHANGEADDR, /* notify after the address change */ NETDEV_PRE_CHANGEADDR, /* notify before the address change */ NETDEV_GOING_DOWN, NETDEV_CHANGENAME, NETDEV_FEAT_CHANGE, NETDEV_BONDING_FAILOVER, NETDEV_PRE_UP, NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, NETDEV_PRE_UNINIT, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, NETDEV_CHANGEUPPER, NETDEV_RESEND_IGMP, NETDEV_PRECHANGEMTU, /* notify before mtu change happened */ NETDEV_CHANGEINFODATA, NETDEV_BONDING_INFO, NETDEV_PRECHANGEUPPER, NETDEV_CHANGELOWERSTATE, NETDEV_UDP_TUNNEL_PUSH_INFO, NETDEV_UDP_TUNNEL_DROP_INFO, NETDEV_CHANGE_TX_QUEUE_LEN, NETDEV_CVLAN_FILTER_PUSH_INFO, NETDEV_CVLAN_FILTER_DROP_INFO, NETDEV_SVLAN_FILTER_PUSH_INFO, NETDEV_SVLAN_FILTER_DROP_INFO, NETDEV_OFFLOAD_XSTATS_ENABLE, NETDEV_OFFLOAD_XSTATS_DISABLE, NETDEV_OFFLOAD_XSTATS_REPORT_USED, NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, NETDEV_XDP_FEAT_CHANGE, }; const char *netdev_cmd_to_name(enum netdev_cmd cmd); int register_netdevice_notifier(struct notifier_block *nb); int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); struct netdev_notifier_info { struct net_device *dev; struct netlink_ext_ack *extack; }; struct netdev_notifier_info_ext { struct netdev_notifier_info info; /* must be first */ union { u32 mtu; } ext; }; struct netdev_notifier_change_info { struct netdev_notifier_info info; /* must be first */ unsigned int flags_changed; }; struct netdev_notifier_changeupper_info { struct netdev_notifier_info info; /* must be first */ struct net_device *upper_dev; /* new upper dev */ bool master; /* is upper dev master */ bool linking; /* is the notification for link or unlink */ void *upper_info; /* upper dev info */ }; struct netdev_notifier_changelowerstate_info { struct netdev_notifier_info info; /* must be first */ void *lower_state_info; /* is lower dev state */ }; struct netdev_notifier_pre_changeaddr_info { struct netdev_notifier_info info; /* must be first */ const unsigned char *dev_addr; }; enum netdev_offload_xstats_type { NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1, }; struct netdev_notifier_offload_xstats_info { struct netdev_notifier_info info; /* must be first */ enum netdev_offload_xstats_type type; union { /* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */ struct netdev_notifier_offload_xstats_rd *report_delta; /* NETDEV_OFFLOAD_XSTATS_REPORT_USED */ struct netdev_notifier_offload_xstats_ru *report_used; }; }; int netdev_offload_xstats_enable(struct net_device *dev, enum netdev_offload_xstats_type type, struct netlink_ext_ack *extack); int netdev_offload_xstats_disable(struct net_device *dev, enum netdev_offload_xstats_type type); bool netdev_offload_xstats_enabled(const struct net_device *dev, enum netdev_offload_xstats_type type); int netdev_offload_xstats_get(struct net_device *dev, enum netdev_offload_xstats_type type, struct rtnl_hw_stats64 *stats, bool *used, struct netlink_ext_ack *extack); void netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd, const struct rtnl_hw_stats64 *stats); void netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru); void netdev_offload_xstats_push_delta(struct net_device *dev, enum netdev_offload_xstats_type type, const struct rtnl_hw_stats64 *stats); static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { info->dev = dev; info->extack = NULL; } static inline struct net_device * netdev_notifier_info_to_dev(const struct netdev_notifier_info *info) { return info->dev; } static inline struct netlink_ext_ack * netdev_notifier_info_to_extack(const struct netdev_notifier_info *info) { return info->extack; } int call_netdevice_notifiers(unsigned long val, struct net_device *dev); int call_netdevice_notifiers_info(unsigned long val, struct netdev_notifier_info *info); #define for_each_netdev(net, d) \ list_for_each_entry(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_reverse(net, d) \ list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_rcu(net, d) \ list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_safe(net, d, n) \ list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue(net, d) \ list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue_reverse(net, d) \ list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \ dev_list) #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_in_bond_rcu(bond, slave) \ for_each_netdev_rcu(dev_net_rcu(bond), slave) \ if (netdev_master_upper_dev_get_rcu(slave) == (bond)) #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) #define for_each_netdev_dump(net, d, ifindex) \ for (; (d = xa_find(&(net)->dev_by_index, &ifindex, \ ULONG_MAX, XA_PRESENT)); ifindex++) static inline struct net_device *next_net_device(struct net_device *dev) { struct list_head *lh; struct net *net; net = dev_net(dev); lh = dev->dev_list.next; return lh == &net->dev_base_head ? NULL : net_device_entry(lh); } static inline struct net_device *next_net_device_rcu(struct net_device *dev) { struct list_head *lh; struct net *net; net = dev_net(dev); lh = rcu_dereference(list_next_rcu(&dev->dev_list)); return lh == &net->dev_base_head ? NULL : net_device_entry(lh); } static inline struct net_device *first_net_device(struct net *net) { return list_empty(&net->dev_base_head) ? NULL : net_device_entry(net->dev_base_head.next); } int netdev_boot_setup_check(struct net_device *dev); struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); void dev_add_pack(struct packet_type *pt); void dev_remove_pack(struct packet_type *pt); void __dev_remove_pack(struct packet_type *pt); void dev_add_offload(struct packet_offload *po); void dev_remove_offload(struct packet_offload *po); int dev_get_iflink(const struct net_device *dev); int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, struct net_device_path_stack *stack); struct net_device *dev_get_by_name(struct net *net, const char *name); struct net_device *dev_get_by_name_rcu(struct net *net, const char *name); struct net_device *__dev_get_by_name(struct net *net, const char *name); bool netdev_name_in_use(struct net *net, const char *name); int dev_alloc_name(struct net_device *dev, const char *name); int netif_open(struct net_device *dev, struct netlink_ext_ack *extack); int dev_open(struct net_device *dev, struct netlink_ext_ack *extack); void netif_close(struct net_device *dev); void dev_close(struct net_device *dev); void netif_close_many(struct list_head *head, bool unlink); void netif_disable_lro(struct net_device *dev); void dev_disable_lro(struct net_device *dev); int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb); u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev); int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id); static inline int dev_queue_xmit(struct sk_buff *skb) { return __dev_queue_xmit(skb, NULL); } static inline int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev) { return __dev_queue_xmit(skb, sb_dev); } static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id) { int ret; ret = __dev_direct_xmit(skb, queue_id); if (!dev_xmit_complete(ret)) kfree_skb(skb); return ret; } int register_netdevice(struct net_device *dev); void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); void unregister_netdevice_many(struct list_head *head); static inline void unregister_netdevice(struct net_device *dev) { unregister_netdevice_queue(dev, NULL); } int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev); struct net_device *netdev_get_xmit_slave(struct net_device *dev, struct sk_buff *skb, bool all_slaves); struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev, struct sock *sk); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *netdev_get_by_index(struct net *net, int ifindex, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_name(struct net *net, const char *name, netdevice_tracker *tracker, gfp_t gfp); struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker, unsigned short flags, unsigned short mask); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); void netdev_copy_name(struct net_device *dev, char *name); static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { if (!dev->header_ops || !dev->header_ops->create) return 0; return dev->header_ops->create(skb, dev, type, daddr, saddr, len); } static inline int dev_parse_header(const struct sk_buff *skb, unsigned char *haddr) { const struct net_device *dev = skb->dev; if (!dev->header_ops || !dev->header_ops->parse) return 0; return dev->header_ops->parse(skb, haddr); } static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb) { const struct net_device *dev = skb->dev; if (!dev->header_ops || !dev->header_ops->parse_protocol) return 0; return dev->header_ops->parse_protocol(skb); } /* ll_header must have at least hard_header_len allocated */ static inline bool dev_validate_header(const struct net_device *dev, char *ll_header, int len) { if (likely(len >= dev->hard_header_len)) return true; if (len < dev->min_header_len) return false; if (capable(CAP_SYS_RAWIO)) { memset(ll_header + len, 0, dev->hard_header_len - len); return true; } if (dev->header_ops && dev->header_ops->validate) return dev->header_ops->validate(ll_header, len); return false; } static inline bool dev_has_header(const struct net_device *dev) { return dev->header_ops && dev->header_ops->create; } struct numa_drop_counters { atomic_t drops0 ____cacheline_aligned_in_smp; atomic_t drops1 ____cacheline_aligned_in_smp; }; static inline int numa_drop_read(const struct numa_drop_counters *ndc) { return atomic_read(&ndc->drops0) + atomic_read(&ndc->drops1); } static inline void numa_drop_add(struct numa_drop_counters *ndc, int val) { int n = numa_node_id() % 2; if (n) atomic_add(val, &ndc->drops1); else atomic_add(val, &ndc->drops0); } static inline void numa_drop_reset(struct numa_drop_counters *ndc) { atomic_set(&ndc->drops0, 0); atomic_set(&ndc->drops1, 0); } /* * Incoming packets are placed on per-CPU queues */ struct softnet_data { struct list_head poll_list; struct sk_buff_head process_queue; local_lock_t process_queue_bh_lock; /* stats */ unsigned int processed; unsigned int time_squeeze; #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif unsigned int received_rps; bool in_net_rx_action; bool in_napi_threaded_poll; #ifdef CONFIG_NET_FLOW_LIMIT struct sd_flow_limit __rcu *flow_limit; #endif struct Qdisc *output_queue; struct Qdisc **output_queue_tailp; struct sk_buff *completion_queue; #ifdef CONFIG_XFRM_OFFLOAD struct sk_buff_head xfrm_backlog; #endif /* written and read only by owning cpu: */ struct netdev_xmit xmit; #ifdef CONFIG_RPS /* input_queue_head should be written by cpu owning this struct, * and only read by other cpus. Worth using a cache line. */ unsigned int input_queue_head ____cacheline_aligned_in_smp; /* Elements below can be accessed between CPUs for RPS/RFS */ call_single_data_t csd ____cacheline_aligned_in_smp; struct softnet_data *rps_ipi_next; unsigned int cpu; unsigned int input_queue_tail; #endif struct sk_buff_head input_pkt_queue; struct napi_struct backlog; struct numa_drop_counters drop_counters; int defer_ipi_scheduled ____cacheline_aligned_in_smp; call_single_data_t defer_csd; }; DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); struct page_pool_bh { struct page_pool *pool; local_lock_t bh_lock; }; DECLARE_PER_CPU(struct page_pool_bh, system_page_pool); #ifndef CONFIG_PREEMPT_RT static inline int dev_recursion_level(void) { return this_cpu_read(softnet_data.xmit.recursion); } #else static inline int dev_recursion_level(void) { return current->net_xmit.recursion; } #endif void __netif_schedule(struct Qdisc *q); void netif_schedule_queue(struct netdev_queue *txq); static inline void netif_tx_schedule_all(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) netif_schedule_queue(netdev_get_tx_queue(dev, i)); } static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue) { clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_start_queue - allow transmit * @dev: network device * * Allow upper layers to call the device hard_start_xmit routine. */ static inline void netif_start_queue(struct net_device *dev) { netif_tx_start_queue(netdev_get_tx_queue(dev, 0)); } static inline void netif_tx_start_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_start_queue(txq); } } void netif_tx_wake_queue(struct netdev_queue *dev_queue); /** * netif_wake_queue - restart transmit * @dev: network device * * Allow upper layers to call the device hard_start_xmit routine. * Used for flow control when transmit resources are available. */ static inline void netif_wake_queue(struct net_device *dev) { netif_tx_wake_queue(netdev_get_tx_queue(dev, 0)); } static inline void netif_tx_wake_all_queues(struct net_device *dev) { unsigned int i; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *txq = netdev_get_tx_queue(dev, i); netif_tx_wake_queue(txq); } } static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { /* Paired with READ_ONCE() from dev_watchdog() */ WRITE_ONCE(dev_queue->trans_start, jiffies); /* This barrier is paired with smp_mb() from dev_watchdog() */ smp_mb__before_atomic(); /* Must be an atomic op see netif_txq_try_stop() */ set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_stop_queue - stop transmitted packets * @dev: network device * * Stop upper layers calling the device hard_start_xmit routine. * Used for flow control when transmit resources are unavailable. */ static inline void netif_stop_queue(struct net_device *dev) { netif_tx_stop_queue(netdev_get_tx_queue(dev, 0)); } void netif_tx_stop_all_queues(struct net_device *dev); static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue) { return test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } /** * netif_queue_stopped - test if transmit queue is flowblocked * @dev: network device * * Test if transmit queue on device is currently unable to send. */ static inline bool netif_queue_stopped(const struct net_device *dev) { return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0)); } static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_ANY_XOFF; } static inline bool netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN; } static inline bool netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN; } /** * netdev_queue_set_dql_min_limit - set dql minimum limit * @dev_queue: pointer to transmit queue * @min_limit: dql minimum limit * * Forces xmit_more() to return true until the minimum threshold * defined by @min_limit is reached (or until the tx queue is * empty). Warning: to be use with care, misuse will impact the * latency. */ static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue, unsigned int min_limit) { #ifdef CONFIG_BQL dev_queue->dql.min_limit = min_limit; #endif } static inline int netdev_queue_dql_avail(const struct netdev_queue *txq) { #ifdef CONFIG_BQL /* Non-BQL migrated drivers will return 0, too. */ return dql_avail(&txq->dql); #else return 0; #endif } /** * netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue * * BQL enabled drivers might use this helper in their ndo_start_xmit(), * to give appropriate hint to the CPU. */ static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue) { #ifdef CONFIG_BQL prefetchw(&dev_queue->dql.num_queued); #endif } /** * netdev_txq_bql_complete_prefetchw - prefetch bql data for write * @dev_queue: pointer to transmit queue * * BQL enabled drivers might use this helper in their TX completion path, * to give appropriate hint to the CPU. */ static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue) { #ifdef CONFIG_BQL prefetchw(&dev_queue->dql.limit); #endif } /** * netdev_tx_sent_queue - report the number of bytes queued to a given tx queue * @dev_queue: network device queue * @bytes: number of bytes queued to the device queue * * Report the number of bytes queued for sending/completion to the network * device hardware queue. @bytes should be a good approximation and should * exactly match netdev_completed_queue() @bytes. * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { #ifdef CONFIG_BQL dql_queued(&dev_queue->dql, bytes); if (likely(dql_avail(&dev_queue->dql) >= 0)) return; /* Paired with READ_ONCE() from dev_watchdog() */ WRITE_ONCE(dev_queue->trans_start, jiffies); /* This barrier is paired with smp_mb() from dev_watchdog() */ smp_mb__before_atomic(); set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state); /* * The XOFF flag must be set before checking the dql_avail below, * because in netdev_tx_completed_queue we update the dql_completed * before checking the XOFF flag. */ smp_mb__after_atomic(); /* check again in case another CPU has just made room avail */ if (unlikely(dql_avail(&dev_queue->dql) >= 0)) clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state); #endif } /* Variant of netdev_tx_sent_queue() for drivers that are aware * that they should not test BQL status themselves. * We do want to change __QUEUE_STATE_STACK_XOFF only for the last * skb of a batch. * Returns true if the doorbell must be used to kick the NIC. */ static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes, bool xmit_more) { if (xmit_more) { #ifdef CONFIG_BQL dql_queued(&dev_queue->dql, bytes); #endif return netif_tx_queue_stopped(dev_queue); } netdev_tx_sent_queue(dev_queue, bytes); return true; } /** * netdev_sent_queue - report the number of bytes queued to hardware * @dev: network device * @bytes: number of bytes queued to the hardware device queue * * Report the number of bytes queued for sending/completion to the network * device hardware queue#0. @bytes should be a good approximation and should * exactly match netdev_completed_queue() @bytes. * This is typically called once per packet, from ndo_start_xmit(). */ static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes) { netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes); } static inline bool __netdev_sent_queue(struct net_device *dev, unsigned int bytes, bool xmit_more) { return __netdev_tx_sent_queue(netdev_get_tx_queue(dev, 0), bytes, xmit_more); } /** * netdev_tx_completed_queue - report number of packets/bytes at TX completion. * @dev_queue: network device queue * @pkts: number of packets (currently ignored) * @bytes: number of bytes dequeued from the device queue * * Must be called at most once per TX completion round (and not per * individual packet), so that BQL can adjust its limits appropriately. */ static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, unsigned int pkts, unsigned int bytes) { #ifdef CONFIG_BQL if (unlikely(!bytes)) return; dql_completed(&dev_queue->dql, bytes); /* * Without the memory barrier there is a small possibility that * netdev_tx_sent_queue will miss the update and cause the queue to * be stopped forever */ smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */ if (unlikely(dql_avail(&dev_queue->dql) < 0)) return; if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state)) netif_schedule_queue(dev_queue); #endif } /** * netdev_completed_queue - report bytes and packets completed by device * @dev: network device * @pkts: actual number of packets sent over the medium * @bytes: actual number of bytes sent over the medium * * Report the number of bytes and packets transmitted by the network device * hardware queue over the physical medium, @bytes must exactly match the * @bytes amount passed to netdev_sent_queue() */ static inline void netdev_completed_queue(struct net_device *dev, unsigned int pkts, unsigned int bytes) { netdev_tx_completed_queue(netdev_get_tx_queue(dev, 0), pkts, bytes); } static inline void netdev_tx_reset_queue(struct netdev_queue *q) { #ifdef CONFIG_BQL clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state); dql_reset(&q->dql); #endif } /** * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue * @dev: network device * @qid: stack index of the queue to reset */ static inline void netdev_tx_reset_subqueue(const struct net_device *dev, u32 qid) { netdev_tx_reset_queue(netdev_get_tx_queue(dev, qid)); } /** * netdev_reset_queue - reset the packets and bytes count of a network device * @dev_queue: network device * * Reset the bytes and packet count of a network device and clear the * software flow control OFF bit for this network device */ static inline void netdev_reset_queue(struct net_device *dev_queue) { netdev_tx_reset_subqueue(dev_queue, 0); } /** * netdev_cap_txqueue - check if selected tx queue exceeds device queues * @dev: network device * @queue_index: given tx queue index * * Returns 0 if given tx queue index >= number of device tx queues, * otherwise returns the originally passed tx queue index. */ static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index) { if (unlikely(queue_index >= dev->real_num_tx_queues)) { net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n", dev->name, queue_index, dev->real_num_tx_queues); return 0; } return queue_index; } /** * netif_running - test if up * @dev: network device * * Test if the device has been brought up. */ static inline bool netif_running(const struct net_device *dev) { return test_bit(__LINK_STATE_START, &dev->state); } /* * Routines to manage the subqueues on a device. We only need start, * stop, and a check if it's stopped. All other device management is * done at the overall netdevice level. * Also test the device if we're multiqueue. */ /** * netif_start_subqueue - allow sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Start individual transmit queue of a device with multiple transmit queues. */ static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_start_queue(txq); } /** * netif_stop_subqueue - stop sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Stop individual transmit queue of a device with multiple transmit queues. */ static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_stop_queue(txq); } /** * __netif_subqueue_stopped - test status of subqueue * @dev: network device * @queue_index: sub queue index * * Check individual transmit queue of a device with multiple transmit queues. */ static inline bool __netif_subqueue_stopped(const struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); return netif_tx_queue_stopped(txq); } /** * netif_subqueue_stopped - test status of subqueue * @dev: network device * @skb: sub queue buffer pointer * * Check individual transmit queue of a device with multiple transmit queues. */ static inline bool netif_subqueue_stopped(const struct net_device *dev, struct sk_buff *skb) { return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb)); } /** * netif_wake_subqueue - allow sending packets on subqueue * @dev: network device * @queue_index: sub queue index * * Resume individual transmit queue of a device with multiple transmit queues. */ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index) { struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); netif_tx_wake_queue(txq); } #ifdef CONFIG_XPS int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index); int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type); /** * netif_attr_test_mask - Test a CPU or Rx queue set in a mask * @j: CPU/Rx queue index * @mask: bitmask of all cpus/rx queues * @nr_bits: number of bits in the bitmask * * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues. */ static inline bool netif_attr_test_mask(unsigned long j, const unsigned long *mask, unsigned int nr_bits) { cpu_max_bits_warn(j, nr_bits); return test_bit(j, mask); } /** * netif_attr_test_online - Test for online CPU/Rx queue * @j: CPU/Rx queue index * @online_mask: bitmask for CPUs/Rx queues that are online * @nr_bits: number of bits in the bitmask * * Returns: true if a CPU/Rx queue is online. */ static inline bool netif_attr_test_online(unsigned long j, const unsigned long *online_mask, unsigned int nr_bits) { cpu_max_bits_warn(j, nr_bits); if (online_mask) return test_bit(j, online_mask); return (j < nr_bits); } /** * netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask * @n: CPU/Rx queue index * @srcp: the cpumask/Rx queue mask pointer * @nr_bits: number of bits in the bitmask * * Returns: next (after n) CPU/Rx queue index in the mask; * >= nr_bits if no further CPUs/Rx queues set. */ static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp, unsigned int nr_bits) { /* -1 is a legal arg here. */ if (n != -1) cpu_max_bits_warn(n, nr_bits); if (srcp) return find_next_bit(srcp, nr_bits, n + 1); return n + 1; } /** * netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p * @n: CPU/Rx queue index * @src1p: the first CPUs/Rx queues mask pointer * @src2p: the second CPUs/Rx queues mask pointer * @nr_bits: number of bits in the bitmask * * Returns: next (after n) CPU/Rx queue index set in both masks; * >= nr_bits if no further CPUs/Rx queues set in both. */ static inline int netif_attrmask_next_and(int n, const unsigned long *src1p, const unsigned long *src2p, unsigned int nr_bits) { /* -1 is a legal arg here. */ if (n != -1) cpu_max_bits_warn(n, nr_bits); if (src1p && src2p) return find_next_and_bit(src1p, src2p, nr_bits, n + 1); else if (src1p) return find_next_bit(src1p, nr_bits, n + 1); else if (src2p) return find_next_bit(src2p, nr_bits, n + 1); return n + 1; } #else static inline int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, u16 index) { return 0; } static inline int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { return 0; } #endif /** * netif_is_multiqueue - test if device has multiple transmit queues * @dev: network device * * Check if device has multiple transmit queues */ static inline bool netif_is_multiqueue(const struct net_device *dev) { return dev->num_tx_queues > 1; } int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq); int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq); int netif_set_real_num_queues(struct net_device *dev, unsigned int txq, unsigned int rxq); int netif_get_num_default_rss_queues(void); void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason); void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason); /* * It is not allowed to call kfree_skb() or consume_skb() from hardware * interrupt context or with hardware interrupts being disabled. * (in_hardirq() || irqs_disabled()) * * We provide four helpers that can be used in following contexts : * * dev_kfree_skb_irq(skb) when caller drops a packet from irq context, * replacing kfree_skb(skb) * * dev_consume_skb_irq(skb) when caller consumes a packet from irq context. * Typically used in place of consume_skb(skb) in TX completion path * * dev_kfree_skb_any(skb) when caller doesn't know its current irq context, * replacing kfree_skb(skb) * * dev_consume_skb_any(skb) when caller doesn't know its current irq context, * and consumed a packet. Used in place of consume_skb(skb) */ static inline void dev_kfree_skb_irq(struct sk_buff *skb) { dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_irq(struct sk_buff *skb) { dev_kfree_skb_irq_reason(skb, SKB_CONSUMED); } static inline void dev_kfree_skb_any(struct sk_buff *skb) { dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_any(struct sk_buff *skb) { dev_kfree_skb_any_reason(skb, SKB_CONSUMED); } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *xdp_prog); void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog); int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb); int netif_rx(struct sk_buff *skb); int __netif_rx(struct sk_buff *skb); int netif_receive_skb(struct sk_buff *skb); int netif_receiv |