Total coverage: 242346 (16%)of 1564090
118 187 197 233 233 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for diskquota-operations. When diskquota is configured these * macros expand to the right source-code. * * Author: Marco van Wieringen <mvw@planets.elm.net> */ #ifndef _LINUX_QUOTAOPS_ #define _LINUX_QUOTAOPS_ #include <linux/fs.h> #define DQUOT_SPACE_WARN 0x1 #define DQUOT_SPACE_RESERVE 0x2 #define DQUOT_SPACE_NOFAIL 0x4 static inline struct quota_info *sb_dqopt(struct super_block *sb) { return &sb->s_dquot; } /* i_mutex must being held */ static inline bool is_quota_modification(struct mnt_idmap *idmap, struct inode *inode, struct iattr *ia) { return ((ia->ia_valid & ATTR_SIZE) || i_uid_needs_update(idmap, ia, inode) || i_gid_needs_update(idmap, ia, inode)); } #if defined(CONFIG_QUOTA) #define quota_error(sb, fmt, args...) \ __quota_error((sb), __func__, fmt , ## args) extern __printf(3, 4) void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...); /* * declaration of quota_function calls in kernel. */ int dquot_initialize(struct inode *inode); bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); static inline struct dquot *dqgrab(struct dquot *dquot) { /* Make sure someone else has active reference to dquot */ WARN_ON_ONCE(!atomic_read(&dquot->dq_count)); WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); atomic_inc(&dquot->dq_count); return dquot; } static inline bool dquot_is_busy(struct dquot *dquot) { if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return true; if (atomic_read(&dquot->dq_count) > 0) return true; return false; } void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv); struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags); void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); void dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); int dquot_disable(struct super_block *sb, int type, unsigned int flags); /* Suspend quotas on remount RO */ static inline int dquot_suspend(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_SUSPENDED); } int dquot_resume(struct super_block *sb, int type); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_get_next_id(struct super_block *sb, struct kqid *qid); int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags); int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int dquot_quota_off(struct super_block *sb, int type); int dquot_writeback_dquots(struct super_block *sb, int type); int dquot_quota_sync(struct super_block *sb, int type); int dquot_get_state(struct super_block *sb, struct qc_state *state); int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii); int dquot_get_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id, struct qc_dqblk *di); int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) { return sb_dqopt(sb)->info + type; } /* * Functions for checking status of quota */ static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } static inline unsigned sb_any_quota_suspended(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED); } /* Does kernel know about any quota information for given sb + type? */ static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } static inline unsigned sb_any_quota_loaded(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED); } static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } /* * Operations supported for diskquotas. */ extern const struct dquot_operations dquot_operations; extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_suspended(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_suspended(struct super_block *sb) { return 0; } /* Does kernel know about any quota information for given sb + type? */ static inline int sb_has_quota_loaded(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_loaded(struct super_block *sb) { return 0; } static inline int sb_has_quota_active(struct super_block *sb, int type) { return 0; } static inline int dquot_initialize(struct inode *inode) { return 0; } static inline bool dquot_initialize_needed(struct inode *inode) { return false; } static inline void dquot_drop(struct inode *inode) { } static inline int dquot_alloc_inode(struct inode *inode) { return 0; } static inline void dquot_free_inode(struct inode *inode) { } static inline int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode, struct iattr *iattr) { return 0; } static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_add_bytes(inode, number); return 0; } static inline void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_sub_bytes(inode, number); } static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); } static inline int dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { inode_sub_bytes(inode, number); return 0; } static inline int dquot_disable(struct super_block *sb, int type, unsigned int flags) { return 0; } static inline int dquot_suspend(struct super_block *sb, int type) { return 0; } static inline int dquot_resume(struct super_block *sb, int type) { return 0; } #define dquot_file_open generic_file_open static inline int dquot_writeback_dquots(struct super_block *sb, int type) { return 0; } #endif /* CONFIG_QUOTA */ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN); } static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr) { __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL); mark_inode_dirty_sync(inode); } static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { int ret; ret = dquot_alloc_space_nodirty(inode, nr); if (!ret) { /* * Mark inode fully dirty. Since we are allocating blocks, inode * would become fully dirty soon anyway and it reportedly * reduces lock contention. */ mark_inode_dirty(inode); } return ret; } static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr) { dquot_alloc_space_nofail(inode, nr << inode->i_blkbits); } static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { return dquot_alloc_space(inode, nr << inode->i_blkbits); } static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0); } static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { int ret; ret = dquot_prealloc_block_nodirty(inode, nr); if (!ret) mark_inode_dirty_sync(inode); return ret; } static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } static inline void dquot_claim_block(struct inode *inode, qsize_t nr) { dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) { dquot_reclaim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr, 0); } static inline void dquot_free_space(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr); mark_inode_dirty_sync(inode); } static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_free_block(struct inode *inode, qsize_t nr) { dquot_free_space(inode, nr << inode->i_blkbits); } static inline void dquot_release_reservation_block(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE); } unsigned int qtype_enforce_flag(int type); #endif /* _LINUX_QUOTAOPS_ */
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * * See COPYING in top-level directory. */ #include "protocol.h" #include "orangefs-kernel.h" /* tags assigned to kernel upcall operations */ static __u64 next_tag_value; static DEFINE_SPINLOCK(next_tag_value_lock); /* the orangefs memory caches */ /* a cache for orangefs upcall/downcall operations */ static struct kmem_cache *op_cache; int op_cache_initialize(void) { op_cache = kmem_cache_create("orangefs_op_cache", sizeof(struct orangefs_kernel_op_s), 0, 0, NULL); if (!op_cache) { gossip_err("Cannot create orangefs_op_cache\n"); return -ENOMEM; } /* initialize our atomic tag counter */ spin_lock(&next_tag_value_lock); next_tag_value = 100; spin_unlock(&next_tag_value_lock); return 0; } int op_cache_finalize(void) { kmem_cache_destroy(op_cache); return 0; } char *get_opname_string(struct orangefs_kernel_op_s *new_op) { if (new_op) { __s32 type = new_op->upcall.type; if (type == ORANGEFS_VFS_OP_FILE_IO) return "OP_FILE_IO"; else if (type == ORANGEFS_VFS_OP_LOOKUP) return "OP_LOOKUP"; else if (type == ORANGEFS_VFS_OP_CREATE) return "OP_CREATE"; else if (type == ORANGEFS_VFS_OP_GETATTR) return "OP_GETATTR"; else if (type == ORANGEFS_VFS_OP_REMOVE) return "OP_REMOVE"; else if (type == ORANGEFS_VFS_OP_MKDIR) return "OP_MKDIR"; else if (type == ORANGEFS_VFS_OP_READDIR) return "OP_READDIR"; else if (type == ORANGEFS_VFS_OP_READDIRPLUS) return "OP_READDIRPLUS"; else if (type == ORANGEFS_VFS_OP_SETATTR) return "OP_SETATTR"; else if (type == ORANGEFS_VFS_OP_SYMLINK) return "OP_SYMLINK"; else if (type == ORANGEFS_VFS_OP_RENAME) return "OP_RENAME"; else if (type == ORANGEFS_VFS_OP_STATFS) return "OP_STATFS"; else if (type == ORANGEFS_VFS_OP_TRUNCATE) return "OP_TRUNCATE"; else if (type == ORANGEFS_VFS_OP_RA_FLUSH) return "OP_RA_FLUSH"; else if (type == ORANGEFS_VFS_OP_FS_MOUNT) return "OP_FS_MOUNT"; else if (type == ORANGEFS_VFS_OP_FS_UMOUNT) return "OP_FS_UMOUNT"; else if (type == ORANGEFS_VFS_OP_GETXATTR) return "OP_GETXATTR"; else if (type == ORANGEFS_VFS_OP_SETXATTR) return "OP_SETXATTR"; else if (type == ORANGEFS_VFS_OP_LISTXATTR) return "OP_LISTXATTR"; else if (type == ORANGEFS_VFS_OP_REMOVEXATTR) return "OP_REMOVEXATTR"; else if (type == ORANGEFS_VFS_OP_PARAM) return "OP_PARAM"; else if (type == ORANGEFS_VFS_OP_PERF_COUNT) return "OP_PERF_COUNT"; else if (type == ORANGEFS_VFS_OP_CANCEL) return "OP_CANCEL"; else if (type == ORANGEFS_VFS_OP_FSYNC) return "OP_FSYNC"; else if (type == ORANGEFS_VFS_OP_FSKEY) return "OP_FSKEY"; else if (type == ORANGEFS_VFS_OP_FEATURES) return "OP_FEATURES"; } return "OP_UNKNOWN?"; } void orangefs_new_tag(struct orangefs_kernel_op_s *op) { spin_lock(&next_tag_value_lock); op->tag = next_tag_value++; if (next_tag_value == 0) next_tag_value = 100; spin_unlock(&next_tag_value_lock); } struct orangefs_kernel_op_s *op_alloc(__s32 type) { struct orangefs_kernel_op_s *new_op = NULL; new_op = kmem_cache_zalloc(op_cache, GFP_KERNEL); if (new_op) { INIT_LIST_HEAD(&new_op->list); spin_lock_init(&new_op->lock); init_completion(&new_op->waitq); new_op->upcall.type = ORANGEFS_VFS_OP_INVALID; new_op->downcall.type = ORANGEFS_VFS_OP_INVALID; new_op->downcall.status = -1; new_op->op_state = OP_VFS_STATE_UNKNOWN; /* initialize the op specific tag and upcall credentials */ orangefs_new_tag(new_op); new_op->upcall.type = type; new_op->attempts = 0; gossip_debug(GOSSIP_CACHE_DEBUG, "Alloced OP (%p: %llu %s)\n", new_op, llu(new_op->tag), get_opname_string(new_op)); new_op->upcall.uid = from_kuid(&init_user_ns, current_fsuid()); new_op->upcall.gid = from_kgid(&init_user_ns, current_fsgid()); } else { gossip_err("op_alloc: kmem_cache_zalloc failed!\n"); } return new_op; } void op_release(struct orangefs_kernel_op_s *orangefs_op) { if (orangefs_op) { gossip_debug(GOSSIP_CACHE_DEBUG, "Releasing OP (%p: %llu)\n", orangefs_op, llu(orangefs_op->tag)); kmem_cache_free(op_cache, orangefs_op); } else { gossip_err("NULL pointer in op_release\n"); } }
25 4 13 8 6 3 10 8 6 12 2 29 29 29 3 26 28 1 29 28 29 8 17 5 17 1 25 7 25 2 2 2 1 2 1 1 1 1 1 1 1 11 11 11 10 11 10 57 58 57 3 1 31 26 3 2 29 2 1 1 26 26 26 41 1 7 4 2 7 26 37 37 24 1 25 24 32 21 21 11 32 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to mapping data to requests */ #include <linux/kernel.h> #include <linux/sched/task_stack.h> #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/uio.h> #include "blk.h" struct bio_map_data { bool is_our_pages : 1; bool is_null_mapped : 1; struct iov_iter iter; struct iovec iov[]; }; static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, gfp_t gfp_mask) { struct bio_map_data *bmd; if (data->nr_segs > UIO_MAXIOV) return NULL; bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); if (!bmd) return NULL; bmd->iter = *data; if (iter_is_iovec(data)) { memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs); bmd->iter.__iov = bmd->iov; } return bmd; } /** * bio_copy_from_iter - copy all pages from iov_iter to bio * @bio: The &struct bio which describes the I/O as destination * @iter: iov_iter as source * * Copy all pages from iov_iter to bio. * Returns 0 on success, or error on failure. */ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { ssize_t ret; ret = copy_page_from_iter(bvec->bv_page, bvec->bv_offset, bvec->bv_len, iter); if (!iov_iter_count(iter)) break; if (ret < bvec->bv_len) return -EFAULT; } return 0; } /** * bio_copy_to_iter - copy all pages from bio to iov_iter * @bio: The &struct bio which describes the I/O as source * @iter: iov_iter as destination * * Copy all pages from bio to iov_iter. * Returns 0 on success, or error on failure. */ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) { struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { ssize_t ret; ret = copy_page_to_iter(bvec->bv_page, bvec->bv_offset, bvec->bv_len, &iter); if (!iov_iter_count(&iter)) break; if (ret < bvec->bv_len) return -EFAULT; } return 0; } /** * bio_uncopy_user - finish previously mapped bio * @bio: bio being terminated * * Free pages allocated from bio_copy_user_iov() and write back data * to user space in case of a read. */ static int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; int ret = 0; if (!bmd->is_null_mapped) { /* * if we're in a workqueue, the request is orphaned, so * don't copy into a random user address space, just free * and return -EINTR so user space doesn't expect any data. */ if (!current->mm) ret = -EINTR; else if (bio_data_dir(bio) == READ) ret = bio_copy_to_iter(bio, bmd->iter); if (bmd->is_our_pages) bio_free_pages(bio); } kfree(bmd); return ret; } static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, struct iov_iter *iter, gfp_t gfp_mask) { struct bio_map_data *bmd; struct page *page; struct bio *bio; int i = 0, ret; int nr_pages; unsigned int len = iter->count; unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; bmd = bio_alloc_map_data(iter, gfp_mask); if (!bmd) return -ENOMEM; /* * We need to do a deep copy of the iov_iter including the iovecs. * The caller provided iov might point to an on-stack or otherwise * shortlived one. */ bmd->is_our_pages = !map_data; bmd->is_null_mapped = (map_data && map_data->null_mapped); nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE)); ret = -ENOMEM; bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); if (map_data) { nr_pages = 1U << map_data->page_order; i = map_data->offset / PAGE_SIZE; } while (len) { unsigned int bytes = PAGE_SIZE; bytes -= offset; if (bytes > len) bytes = len; if (map_data) { if (i == map_data->nr_entries * nr_pages) { ret = -ENOMEM; goto cleanup; } page = map_data->pages[i / nr_pages]; page += (i % nr_pages); i++; } else { page = alloc_page(GFP_NOIO | gfp_mask); if (!page) { ret = -ENOMEM; goto cleanup; } } if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) { if (!map_data) __free_page(page); break; } len -= bytes; offset = 0; } if (map_data) map_data->offset += bio->bi_iter.bi_size; /* * success */ if (iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) { ret = bio_copy_from_iter(bio, iter); if (ret) goto cleanup; } else if (map_data && map_data->from_user) { struct iov_iter iter2 = *iter; /* This is the copy-in part of SG_DXFER_TO_FROM_DEV. */ iter2.data_source = ITER_SOURCE; ret = bio_copy_from_iter(bio, &iter2); if (ret) goto cleanup; } else { if (bmd->is_our_pages) zero_fill_bio(bio); iov_iter_advance(iter, bio->bi_iter.bi_size); } bio->bi_private = bmd; ret = blk_rq_append_bio(rq, bio); if (ret) goto cleanup; return 0; cleanup: if (!map_data) bio_free_pages(bio); bio_uninit(bio); kfree(bio); out_bmd: kfree(bmd); return ret; } static void blk_mq_map_bio_put(struct bio *bio) { if (bio->bi_opf & REQ_ALLOC_CACHE) { bio_put(bio); } else { bio_uninit(bio); kfree(bio); } } static struct bio *blk_rq_map_bio_alloc(struct request *rq, unsigned int nr_vecs, gfp_t gfp_mask) { struct bio *bio; if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) { bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask, &fs_bio_set); if (!bio) return NULL; } else { bio = bio_kmalloc(nr_vecs, gfp_mask); if (!bio) return NULL; bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); } return bio; } static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, gfp_t gfp_mask) { iov_iter_extraction_t extraction_flags = 0; unsigned int max_sectors = queue_max_hw_sectors(rq->q); unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; int ret; int j; if (!iov_iter_count(iter)) return -EINVAL; bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); if (bio == NULL) return -ENOMEM; if (blk_queue_pci_p2pdma(rq->q)) extraction_flags |= ITER_ALLOW_P2PDMA; if (iov_iter_extract_will_pin(iter)) bio_set_flag(bio, BIO_PAGE_PINNED); while (iov_iter_count(iter)) { struct page *stack_pages[UIO_FASTIOV]; struct page **pages = stack_pages; ssize_t bytes; size_t offs; int npages; if (nr_vecs > ARRAY_SIZE(stack_pages)) pages = NULL; bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX, nr_vecs, extraction_flags, &offs); if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; goto out_unmap; } npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); if (unlikely(offs & queue_dma_alignment(rq->q))) j = 0; else { for (j = 0; j < npages; j++) { struct page *page = pages[j]; unsigned int n = PAGE_SIZE - offs; bool same_page = false; if (n > bytes) n = bytes; if (!bio_add_hw_page(rq->q, bio, page, n, offs, max_sectors, &same_page)) break; if (same_page) bio_release_page(bio, page); bytes -= n; offs = 0; } } /* * release the pages we didn't map into the bio, if any */ while (j < npages) bio_release_page(bio, pages[j++]); if (pages != stack_pages) kvfree(pages); /* couldn't stuff something into bio? */ if (bytes) { iov_iter_revert(iter, bytes); break; } } ret = blk_rq_append_bio(rq, bio); if (ret) goto out_unmap; return 0; out_unmap: bio_release_pages(bio, false); blk_mq_map_bio_put(bio); return ret; } static void bio_invalidate_vmalloc_pages(struct bio *bio) { #ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE if (bio->bi_private && !op_is_write(bio_op(bio))) { unsigned long i, len = 0; for (i = 0; i < bio->bi_vcnt; i++) len += bio->bi_io_vec[i].bv_len; invalidate_kernel_vmap_range(bio->bi_private, len); } #endif } static void bio_map_kern_endio(struct bio *bio) { bio_invalidate_vmalloc_pages(bio); bio_uninit(bio); kfree(bio); } /** * bio_map_kern - map kernel address into bio * @q: the struct request_queue for the bio * @data: pointer to buffer to map * @len: length in bytes * @gfp_mask: allocation flags for bio allocation * * Map the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ static struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT; const int nr_pages = end - start; bool is_vmalloc = is_vmalloc_addr(data); struct page *page; int offset, i; struct bio *bio; bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); if (is_vmalloc) { flush_kernel_vmap_range(data, len); bio->bi_private = data; } offset = offset_in_page(kaddr); for (i = 0; i < nr_pages; i++) { unsigned int bytes = PAGE_SIZE - offset; if (len <= 0) break; if (bytes > len) bytes = len; if (!is_vmalloc) page = virt_to_page(data); else page = vmalloc_to_page(data); if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ bio_uninit(bio); kfree(bio); return ERR_PTR(-EINVAL); } data += bytes; len -= bytes; offset = 0; } bio->bi_end_io = bio_map_kern_endio; return bio; } static void bio_copy_kern_endio(struct bio *bio) { bio_free_pages(bio); bio_uninit(bio); kfree(bio); } static void bio_copy_kern_endio_read(struct bio *bio) { char *p = bio->bi_private; struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { memcpy_from_bvec(p, bvec); p += bvec->bv_len; } bio_copy_kern_endio(bio); } /** * bio_copy_kern - copy kernel address into bio * @q: the struct request_queue for the bio * @data: pointer to buffer to copy * @len: length in bytes * @gfp_mask: allocation flags for bio and page allocation * @reading: data direction is READ * * copy the kernel address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ static struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask, int reading) { unsigned long kaddr = (unsigned long)data; unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; unsigned long start = kaddr >> PAGE_SHIFT; struct bio *bio; void *p = data; int nr_pages = 0; /* * Overflow, abort */ if (end < start) return ERR_PTR(-EINVAL); nr_pages = end - start; bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); while (len) { struct page *page; unsigned int bytes = PAGE_SIZE; if (bytes > len) bytes = len; page = alloc_page(GFP_NOIO | __GFP_ZERO | gfp_mask); if (!page) goto cleanup; if (!reading) memcpy(page_address(page), p, bytes); if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) break; len -= bytes; p += bytes; } if (reading) { bio->bi_end_io = bio_copy_kern_endio_read; bio->bi_private = data; } else { bio->bi_end_io = bio_copy_kern_endio; } return bio; cleanup: bio_free_pages(bio); bio_uninit(bio); kfree(bio); return ERR_PTR(-ENOMEM); } /* * Append a bio to a passthrough request. Only works if the bio can be merged * into the request based on the driver constraints. */ int blk_rq_append_bio(struct request *rq, struct bio *bio) { struct bvec_iter iter; struct bio_vec bv; unsigned int nr_segs = 0; bio_for_each_bvec(bv, bio, iter) nr_segs++; if (!rq->bio) { blk_rq_bio_prep(rq, bio, nr_segs); } else { if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; rq->biotail->bi_next = bio; rq->biotail = bio; rq->__data_len += (bio)->bi_iter.bi_size; bio_crypt_free_ctx(bio); } return 0; } EXPORT_SYMBOL(blk_rq_append_bio); /* Prepare bio for passthrough IO given ITER_BVEC iter */ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) { const struct queue_limits *lim = &rq->q->limits; unsigned int max_bytes = lim->max_hw_sectors << SECTOR_SHIFT; unsigned int nsegs; struct bio *bio; int ret; if (!iov_iter_count(iter) || iov_iter_count(iter) > max_bytes) return -EINVAL; /* reuse the bvecs from the iterator instead of allocating new ones */ bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL); if (!bio) return -ENOMEM; bio_iov_bvec_set(bio, iter); /* check that the data layout matches the hardware restrictions */ ret = bio_split_rw_at(bio, lim, &nsegs, max_bytes); if (ret) { /* if we would have to split the bio, copy instead */ if (ret > 0) ret = -EREMOTEIO; blk_mq_map_bio_put(bio); return ret; } blk_rq_bio_prep(rq, bio, nsegs); return 0; } /** * blk_rq_map_user_iov - map user data to a request, for passthrough requests * @q: request queue where request should be inserted * @rq: request to map data to * @map_data: pointer to the rq_map_data holding pages (if necessary) * @iter: iovec iterator * @gfp_mask: memory allocation flags * * Description: * Data will be mapped directly for zero copy I/O, if possible. Otherwise * a kernel bounce buffer is used. * * A matching blk_rq_unmap_user() must be issued at the end of I/O, while * still in process context. */ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { bool copy = false, map_bvec = false; unsigned long align = blk_lim_dma_alignment_and_pad(&q->limits); struct bio *bio = NULL; struct iov_iter i; int ret = -EINVAL; if (map_data) copy = true; else if (blk_queue_may_bounce(q)) copy = true; else if (iov_iter_alignment(iter) & align) copy = true; else if (iov_iter_is_bvec(iter)) map_bvec = true; else if (!user_backed_iter(iter)) copy = true; else if (queue_virt_boundary(q)) copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); if (map_bvec) { ret = blk_rq_map_user_bvec(rq, iter); if (!ret) return 0; if (ret != -EREMOTEIO) goto fail; /* fall back to copying the data on limits mismatches */ copy = true; } i = *iter; do { if (copy) ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask); else ret = bio_map_user_iov(rq, &i, gfp_mask); if (ret) goto unmap_rq; if (!bio) bio = rq->bio; } while (iov_iter_count(&i)); return 0; unmap_rq: blk_rq_unmap_user(bio); fail: rq->bio = NULL; return ret; } EXPORT_SYMBOL(blk_rq_map_user_iov); int blk_rq_map_user(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, void __user *ubuf, unsigned long len, gfp_t gfp_mask) { struct iov_iter i; int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i); if (unlikely(ret < 0)) return ret; return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask); } EXPORT_SYMBOL(blk_rq_map_user); int blk_rq_map_user_io(struct request *req, struct rq_map_data *map_data, void __user *ubuf, unsigned long buf_len, gfp_t gfp_mask, bool vec, int iov_count, bool check_iter_count, int rw) { int ret = 0; if (vec) { struct iovec fast_iov[UIO_FASTIOV]; struct iovec *iov = fast_iov; struct iov_iter iter; ret = import_iovec(rw, ubuf, iov_count ? iov_count : buf_len, UIO_FASTIOV, &iov, &iter); if (ret < 0) return ret; if (iov_count) { /* SG_IO howto says that the shorter of the two wins */ iov_iter_truncate(&iter, buf_len); if (check_iter_count && !iov_iter_count(&iter)) { kfree(iov); return -EINVAL; } } ret = blk_rq_map_user_iov(req->q, req, map_data, &iter, gfp_mask); kfree(iov); } else if (buf_len) { ret = blk_rq_map_user(req->q, req, map_data, ubuf, buf_len, gfp_mask); } return ret; } EXPORT_SYMBOL(blk_rq_map_user_io); /** * blk_rq_unmap_user - unmap a request with user data * @bio: start of bio list * * Description: * Unmap a rq previously mapped by blk_rq_map_user(). The caller must * supply the original rq->bio from the blk_rq_map_user() return, since * the I/O completion may have changed rq->bio. */ int blk_rq_unmap_user(struct bio *bio) { struct bio *next_bio; int ret = 0, ret2; while (bio) { if (bio->bi_private) { ret2 = bio_uncopy_user(bio); if (ret2 && !ret) ret = ret2; } else { bio_release_pages(bio, bio_data_dir(bio) == READ); } if (bio_integrity(bio)) bio_integrity_unmap_user(bio); next_bio = bio; bio = bio->bi_next; blk_mq_map_bio_put(next_bio); } return ret; } EXPORT_SYMBOL(blk_rq_unmap_user); /** * blk_rq_map_kern - map kernel data to a request, for passthrough requests * @q: request queue where request should be inserted * @rq: request to fill * @kbuf: the kernel buffer * @len: length of user data * @gfp_mask: memory allocation flags * * Description: * Data will be mapped directly if possible. Otherwise a bounce * buffer is used. Can be called multiple times to append multiple * buffers. */ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, unsigned int len, gfp_t gfp_mask) { int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; struct bio *bio; int ret; if (len > (queue_max_hw_sectors(q) << 9)) return -EINVAL; if (!len || !kbuf) return -EINVAL; if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) || blk_queue_may_bounce(q)) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else bio = bio_map_kern(q, kbuf, len, gfp_mask); if (IS_ERR(bio)) return PTR_ERR(bio); bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= req_op(rq); ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { bio_uninit(bio); kfree(bio); } return ret; } EXPORT_SYMBOL(blk_rq_map_kern);
44 44 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 /* * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/in.h> #include <net/tcp.h> #include <trace/events/sock.h> #include "rds.h" #include "tcp.h" void rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ sock_set_keepalive(sock->sk); tcp_sock_set_keepcnt(sock->sk, keepcnt); tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. */ tcp_sock_set_keepintvl(sock->sk, keepidle); } /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the * client's ipaddr < server's ipaddr. Otherwise, close the accepted * socket and force a reconneect from smaller -> larger ip addr. The reason * we special case cp_index 0 is to allow the rds probe ping itself to itself * get through efficiently. * Since reconnects are only initiated from the node with the numerically * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side * by moving them to CONNECTING in this function. */ static struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; int npaths = max_t(int, 1, conn->c_npaths); /* for mprds, all paths MUST be initiated by the peer * with the smaller address. */ if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) { /* Make sure we initiate at least one path if this * has not already been done; rds_start_mprds() will * take care of additional paths, if necessary. */ if (npaths == 1) rds_conn_path_connect_if_down(&conn->c_path[0]); return NULL; } for (i = 0; i < npaths; i++) { struct rds_conn_path *cp = &conn->c_path[i]; if (rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING) || rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_CONNECTING)) { return cp->cp_transport_data; } } return NULL; } int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; struct rds_connection *conn; int ret; struct inet_sock *inet; struct rds_tcp_connection *rs_tcp = NULL; int conn_state; struct rds_conn_path *cp; struct in6_addr *my_addr, *peer_addr; struct proto_accept_arg arg = { .flags = O_NONBLOCK, .kern = true, }; #if !IS_ENABLED(CONFIG_IPV6) struct in6_addr saddr, daddr; #endif int dev_if = 0; if (!sock) /* module unload or netns delete in progress */ return -ENETUNREACH; ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, &new_sock); if (ret) goto out; ret = sock->ops->accept(sock, new_sock, &arg); if (ret < 0) goto out; /* sock_create_lite() does not get a hold on the owner module so we * need to do it here. Note that sock_release() uses sock->ops to * determine if it needs to decrement the reference count. So set * sock->ops after calling accept() in case that fails. And there's * no need to do try_module_get() as the listener should have a hold * already. */ new_sock->ops = sock->ops; __module_get(new_sock->ops->owner); rds_tcp_keepalive(new_sock); if (!rds_tcp_tune(new_sock)) { ret = -EINVAL; goto out; } inet = inet_sk(new_sock->sk); #if IS_ENABLED(CONFIG_IPV6) my_addr = &new_sock->sk->sk_v6_rcv_saddr; peer_addr = &new_sock->sk->sk_v6_daddr; #else ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr); ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr); my_addr = &saddr; peer_addr = &daddr; #endif rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n", sock->sk->sk_family, my_addr, ntohs(inet->inet_sport), peer_addr, ntohs(inet->inet_dport)); #if IS_ENABLED(CONFIG_IPV6) /* sk_bound_dev_if is not set if the peer address is not link local * address. In this case, it happens that mcast_oif is set. So * just use it. */ if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) && !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) { struct ipv6_pinfo *inet6; inet6 = inet6_sk(new_sock->sk); dev_if = READ_ONCE(inet6->mcast_oif); } else { dev_if = new_sock->sk->sk_bound_dev_if; } #endif if (!rds_tcp_laddr_check(sock_net(sock->sk), peer_addr, dev_if)) { /* local address connection is only allowed via loopback */ ret = -EOPNOTSUPP; goto out; } conn = rds_conn_create(sock_net(sock->sk), my_addr, peer_addr, &rds_tcp_transport, 0, GFP_KERNEL, dev_if); if (IS_ERR(conn)) { ret = PTR_ERR(conn); goto out; } /* An incoming SYN request came in, and TCP just accepted it. * * If the client reboots, this conn will need to be cleaned up. * rds_tcp_state_change() will do that cleanup */ rs_tcp = rds_tcp_accept_one_path(conn); if (!rs_tcp) goto rst_nsk; mutex_lock(&rs_tcp->t_conn_path_lock); cp = rs_tcp->t_cpath; conn_state = rds_conn_path_state(cp); WARN_ON(conn_state == RDS_CONN_UP); if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) goto rst_nsk; if (rs_tcp->t_sock) { /* Duelling SYN has been handled in rds_tcp_accept_one() */ rds_tcp_reset_callbacks(new_sock, cp); /* rds_connect_path_complete() marks RDS_CONN_UP */ rds_connect_path_complete(cp, RDS_CONN_RESETTING); } else { rds_tcp_set_callbacks(new_sock, cp); rds_connect_path_complete(cp, RDS_CONN_CONNECTING); } new_sock = NULL; ret = 0; if (conn->c_npaths == 0) rds_send_ping(cp->cp_conn, cp->cp_index); goto out; rst_nsk: /* reset the newly returned accept sock and bail. * It is safe to set linger on new_sock because the RDS connection * has not been brought up on new_sock, so no RDS-level data could * be pending on it. By setting linger, we achieve the side-effect * of avoiding TIME_WAIT state on new_sock. */ sock_no_linger(new_sock->sk); kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: if (rs_tcp) mutex_unlock(&rs_tcp->t_conn_path_lock); if (new_sock) sock_release(new_sock); return ret; } void rds_tcp_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); trace_sk_data_ready(sk); rdsdebug("listen data ready sk %p\n", sk); read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (!ready) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } /* * ->sk_data_ready is also called for a newly established child socket * before it has been accepted and the accepter has set up their * data_ready.. we only want to queue listen work for our listening * socket * * (*ready)() may be null if we are racing with netns delete, and * the listen socket is being torn down. */ if (sk->sk_state == TCP_LISTEN) rds_tcp_accept_work(sk); else ready = rds_tcp_listen_sock_def_readable(sock_net(sk)); out: read_unlock_bh(&sk->sk_callback_lock); if (ready) ready(sk); } struct socket *rds_tcp_listen_init(struct net *net, bool isv6) { struct socket *sock = NULL; struct sockaddr_storage ss; struct sockaddr_in6 *sin6; struct sockaddr_in *sin; int addr_len; int ret; ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) { rdsdebug("could not create %s listener socket: %d\n", isv6 ? "IPv6" : "IPv4", ret); goto out; } sock->sk->sk_reuse = SK_CAN_REUSE; tcp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = sock->sk->sk_data_ready; sock->sk->sk_data_ready = rds_tcp_listen_data_ready; write_unlock_bh(&sock->sk->sk_callback_lock); if (isv6) { sin6 = (struct sockaddr_in6 *)&ss; sin6->sin6_family = PF_INET6; sin6->sin6_addr = in6addr_any; sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT); sin6->sin6_scope_id = 0; sin6->sin6_flowinfo = 0; addr_len = sizeof(*sin6); } else { sin = (struct sockaddr_in *)&ss; sin->sin_family = PF_INET; sin->sin_addr.s_addr = INADDR_ANY; sin->sin_port = (__force u16)htons(RDS_TCP_PORT); addr_len = sizeof(*sin); } ret = kernel_bind(sock, (struct sockaddr *)&ss, addr_len); if (ret < 0) { rdsdebug("could not bind %s listener socket: %d\n", isv6 ? "IPv6" : "IPv4", ret); goto out; } ret = sock->ops->listen(sock, 64); if (ret < 0) goto out; return sock; out: if (sock) sock_release(sock); return NULL; } void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor) { struct sock *sk; if (!sock) return; sk = sock->sk; /* serialize with and prevent further callbacks */ lock_sock(sk); write_lock_bh(&sk->sk_callback_lock); if (sk->sk_user_data) { sk->sk_data_ready = sk->sk_user_data; sk->sk_user_data = NULL; } write_unlock_bh(&sk->sk_callback_lock); release_sock(sk); /* wait for accepts to stop and close the socket */ flush_workqueue(rds_wq); flush_work(acceptor); sock_release(sock); }
12592 15730 5 8589 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion * * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html * */ #ifndef __LINUX_RCUPDATE_H #define __LINUX_RCUPDATE_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <linux/cleanup.h> #include <asm/processor.h> #include <linux/context_tracking_irq.h> #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define RCU_SEQ_CTR_SHIFT 2 #define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void synchronize_rcu(void); struct rcu_gp_oldstate; unsigned long get_completed_synchronize_rcu(void); void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp); // Maximum number of unsigned long values corresponding to // not-yet-completed RCU grace periods. #define NUM_ACTIVE_RCU_POLL_OLDSTATE 2 /** * same_state_synchronize_rcu - Are two old-state values identical? * @oldstate1: First old-state value. * @oldstate2: Second old-state value. * * The two old-state values must have been obtained from either * get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or * get_completed_synchronize_rcu(). Returns @true if the two values are * identical and @false otherwise. This allows structures whose lifetimes * are tracked by old-state values to push these values to a list header, * allowing those structures to be slightly smaller. */ static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2) { return oldstate1 == oldstate2; } #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); void __rcu_read_unlock(void); /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TINY_RCU #define rcu_read_unlock_strict() do { } while (0) #else void rcu_read_unlock_strict(void); #endif static inline void __rcu_read_lock(void) { preempt_disable(); } static inline void __rcu_read_unlock(void) { preempt_enable(); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) { return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_RCU_LAZY void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func); #else static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { call_rcu(head, func); } #endif /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active; void rcu_sched_clock_irq(int user); #ifdef CONFIG_TASKS_RCU_GENERIC void rcu_init_tasks_generic(void); #else static inline void rcu_init_tasks_generic(void) { } #endif #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) void rcu_irq_work_resched(void); #else static inline void rcu_irq_work_resched(void) { } #endif #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); void rcu_nocb_flush_deferred_wakeup(void); #define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #define RCU_NOCB_LOCKDEP_WARN(c, s) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC # ifdef CONFIG_TASKS_RCU # define rcu_tasks_classic_qs(t, preempt) \ do { \ if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); void rcu_tasks_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu # define synchronize_rcu_tasks synchronize_rcu # endif # ifdef CONFIG_TASKS_TRACE_RCU // Bits for ->trc_reader_special.b.need_qs field. #define TRC_NEED_QS 0x1 // Task needs a quiescent state. #define TRC_NEED_QS_CHECKED 0x2 // Task has been checked for needing quiescent state. u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new); void rcu_tasks_trace_qs_blkd(struct task_struct *t); # define rcu_tasks_trace_qs(t) \ do { \ int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting); \ \ if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) && \ likely(!___rttq_nesting)) { \ rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED); \ } else if (___rttq_nesting && ___rttq_nesting != INT_MIN && \ !READ_ONCE((t)->trc_reader_special.b.blocked)) { \ rcu_tasks_trace_qs_blkd(t); \ } \ } while (0) void rcu_tasks_trace_torture_stats_print(char *tt, char *tf); # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif #define rcu_tasks_qs(t, preempt) \ do { \ rcu_tasks_classic_qs((t), (preempt)); \ rcu_tasks_trace_qs(t); \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU void synchronize_rcu_tasks_rude(void); void rcu_tasks_rude_torture_stats_print(char *tt, char *tf); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_classic_qs(t, preempt) do { } while (0) #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? * * As an accident of implementation, an RCU Tasks Trace grace period also * acts as an RCU grace period. However, this could change at any time. * Code relying on this accident must call this function to verify that * this accident is still happening. * * You have been warned! */ static inline bool rcu_trace_implies_rcu_gp(void) { return true; } /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) /** * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states * @old_ts: jiffies at start of processing. * * This helper is for long-running softirq handlers, such as NAPI threads in * networking. The caller should initialize the variable passed in as @old_ts * at the beginning of the softirq handler. When invoked frequently, this macro * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will * provide both RCU and RCU-Tasks quiescent states. Note that this macro * modifies its old_ts argument. * * Because regions of code that have disabled softirq act as RCU read-side * critical sections, this macro should be invoked with softirq (and * preemption) enabled. * * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would * have more chance to invoke schedule() calls and provide necessary quiescent * states. As a contrast, calling cond_resched() only won't achieve the same * effect because cond_resched() does not provide RCU-Tasks quiescent states. */ #define rcu_softirq_qs_periodic(old_ts) \ do { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \ time_after(jiffies, (old_ts) + HZ / 10)) { \ preempt_disable(); \ rcu_softirq_qs(); \ preempt_enable(); \ (old_ts) = jiffies; \ } \ } while (0) /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ #if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif /* * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls * are needed for dynamic initialization and destruction of rcu_head * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for * dynamic initialization and destruction of statically allocated rcu_head * structures. However, rcu_head structures allocated dynamically in the * heap don't need any initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head(struct rcu_head *head) { } static inline void destroy_rcu_head(struct rcu_head *head) { } static inline void init_rcu_head_on_stack(struct rcu_head *head) { } static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_try_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); } int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) # define rcu_try_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { return 1; } static inline int rcu_read_lock_bh_held(void) { return 1; } static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } static inline int rcu_read_lock_any_held(void) { return !preemptible(); } static inline int debug_lockdep_rcu_enabled(void) { return 0; } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message * * This checks debug_lockdep_rcu_enabled() before checking (c) to * prevent early boot splats due to lockdep not yet being initialized, * and rechecks it after checking (c) to prevent false-positive splats * due to races with lockdep being disabled. See commit 3066820034b5dd * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail. */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data..unlikely") __warned; \ if (debug_lockdep_rcu_enabled() && (c) && \ debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), "Illegal context switch in RCU read-side critical section"); } #else // #ifndef CONFIG_PREEMPT_RCU static inline void rcu_preempt_sleep_check(void) { } #endif // #else // #ifndef CONFIG_PREEMPT_RCU #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) // See RCU_LOCKDEP_WARN() for an explanation of the double call to // debug_lockdep_rcu_enabled(). static inline bool lockdep_assert_rcu_helper(bool c) { return debug_lockdep_rcu_enabled() && (c || !rcu_is_watching() || !rcu_lockdep_current_cpu_online()) && debug_lockdep_rcu_enabled(); } /** * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock() * * Splats if lockdep is enabled and there is no rcu_read_lock() in effect. */ #define lockdep_assert_in_rcu_read_lock() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map))) /** * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh() * * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect. * Note that local_bh_disable() and friends do not suffice here, instead an * actual rcu_read_lock_bh() is required. */ #define lockdep_assert_in_rcu_read_lock_bh() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map))) /** * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched() * * Splats if lockdep is enabled and there is no rcu_read_lock_sched() * in effect. Note that preempt_disable() and friends do not suffice here, * instead an actual rcu_read_lock_sched() is required. */ #define lockdep_assert_in_rcu_read_lock_sched() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map))) /** * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader * * Splats if lockdep is enabled and there is no RCU reader of any * type in effect. Note that regions of code protected by things like * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify * as RCU readers. * * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY * kernels that are not also built with PREEMPT_COUNT. But if you have * lockdep enabled, you might as well also enable PREEMPT_COUNT. */ #define lockdep_assert_in_rcu_reader() \ WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) && \ !lock_is_held(&rcu_bh_lock_map) && \ !lock_is_held(&rcu_sched_lock_map) && \ preemptible())) #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c)) #define rcu_sleep_check() do { } while (0) #define lockdep_assert_in_rcu_read_lock() do { } while (0) #define lockdep_assert_in_rcu_read_lock_bh() do { } while (0) #define lockdep_assert_in_rcu_read_lock_sched() do { } while (0) #define lockdep_assert_in_rcu_reader() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* * Helper functions for rcu_dereference_check(), rcu_dereference_protected() * and rcu_assign_pointer(). Some of these could be folded into their * callers, but they are left separate in order to ease introduction of * multiple pointers markings to match different RCU implementations * (e.g., __srcu), should this make sense in the future. */ #ifdef __CHECKER__ #define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ #define __unrcu_pointer(p, local) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)(p); \ rcu_check_sparse(p, __rcu); \ ((typeof(*p) __force __kernel *)(local)); \ }) /** * unrcu_pointer - mark a pointer as not being RCU protected * @p: pointer needing to lose its __rcu property * * Converts @p from an __rcu pointer to a __kernel pointer. * This allows an __rcu pointer to be used with xchg() and friends. */ #define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu)) #define __rcu_access_pointer(p, local, space) \ ({ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_check(p, local, c, space) \ ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define __rcu_dereference_protected(p, local, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define __rcu_dereference_raw(p, local) \ ({ \ /* Dependency order vs. p above. */ \ typeof(p) local = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(local)); \ }) #define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu)) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to * @v: value to assign (publish) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from * reordering the code that initializes the structure after the pointer * assignment. More importantly, this call documents which pointers * will be dereferenced by RCU read-side code. * * In some special cases, you may use RCU_INIT_POINTER() instead * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due * to the fact that it does not constrain either the CPU or the compiler. * That said, using RCU_INIT_POINTER() when you should have used * rcu_assign_pointer() is a very bad thing that results in * impossible-to-diagnose memory corruption. So please be careful. * See the RCU_INIT_POINTER() comment header for details. * * Note that rcu_assign_pointer() evaluates each of its arguments only * once, appearances notwithstanding. One of the "extra" evaluations * is in typeof() and the other visible only to sparse (__CHECKER__), * neither of which actually execute the argument. As with most cpp * macros, this execute-arguments-only-once property is important, so * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ #define rcu_assign_pointer(p, v) \ do { \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ } while (0) /** * rcu_replace_pointer() - replace an RCU pointer, returning its old value * @rcu_ptr: RCU pointer, whose old value is returned * @ptr: regular pointer * @c: the lockdep conditions under which the dereference will take place * * Perform a replacement, where @rcu_ptr is an RCU-annotated * pointer and @c is the lockdep argument that is passed to the * rcu_dereference_protected() call used to read that pointer. The old * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. */ #define rcu_replace_pointer(rcu_ptr, ptr, c) \ ({ \ typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ __tmp; \ }) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. * Within an RCU read-side critical section, there is little reason to * use rcu_access_pointer(). * * It is usually best to test the rcu_access_pointer() return value * directly in order to avoid accidental dereferences being introduced * by later inattentive changes. In other words, assigning the * rcu_access_pointer() return value to a local variable results in an * accident waiting to happen. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as is * the case in the context of the RCU callback that is freeing up the data, * or after a synchronize_rcu() returns. This can be useful when tearing * down multi-linked structures after a grace period has elapsed. However, * rcu_dereference_protected() is normally preferred for this use case. */ #define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the * dereference will take place are correct. Typically the conditions * indicate the various locking conditions that should be held at that * point. The check should return true if the conditions are satisfied. * An implicit check for being in an RCU read-side critical section * (rcu_read_lock()) is included. * * For example: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); * * Inserts memory barriers on architectures that require them * (currently only the Alpha), prevents the compiler from refetching * (and from merging fetches), and, more importantly, documents exactly * which pointers are protected by RCU and checks that the pointer is * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-bh counterpart to rcu_dereference_check(). However, * please note that starting in v5.0 kernels, vanilla RCU grace periods * wait for local_bh_disable() regions of code in addition to regions of * code demarked by rcu_read_lock() and rcu_read_unlock(). This means * that synchronize_rcu(), call_rcu, and friends all take not only * rcu_read_lock() but also rcu_read_lock_bh() into account. */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). * However, please note that starting in v5.0 kernels, vanilla RCU grace * periods wait for preempt_disable() regions of code in addition to * regions of code demarked by rcu_read_lock() and rcu_read_unlock(). * This means that synchronize_rcu(), call_rcu, and friends all take not * only rcu_read_lock() but also rcu_read_lock_sched() into account. */ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), \ (c) || rcu_read_lock_sched_held(), \ __rcu) /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_check(p) \ __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(). This is useful in cases where update-side locks * prevent the value of the pointer from changing. Please note that this * primitive does *not* prevent the compiler from repeating this reference * or combining it with other references, so it should not be used without * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu) /** * rcu_dereference() - fetch RCU-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * This is a simple wrapper around rcu_dereference_check(). */ #define rcu_dereference(p) rcu_dereference_check(p, 0) /** * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) /** * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off * * This is simply an identity function, but it documents where a pointer * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows:: * * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); * if (long_lived) { * if (!atomic_inc_not_zero(p->refcnt)) * long_lived = false; * else * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); */ #define rcu_pointer_handoff(p) (p) /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the * synchronize_rcu() is guaranteed to block until after all the other * CPUs exit their critical sections. Similarly, if call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * In v5.0 and later kernels, synchronize_rcu() and call_rcu() also * wait for regions of code with preemption disabled, including regions of * code with interrupts or softirqs disabled. In pre-v5.0 kernels, which * define synchronize_sched(), only code enclosed within rcu_read_lock() * and rcu_read_unlock() are guaranteed to be waited for. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. */ static __always_inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In almost all situations, rcu_read_unlock() is immune from deadlock. * In recent kernels that have consolidated synchronize_sched() and * synchronize_rcu_bh() into synchronize_rcu(), this deadlock immunity * also extends to the scheduler's runqueue and priority-inheritance * spinlocks, courtesy of the quiescent-state deferral that is carried * out when rcu_read_unlock() is invoked with interrupts disabled. * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ __release(RCU); __rcu_read_unlock(); } /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent to rcu_read_lock(), but also disables softirqs. * Note that anything else that disables softirqs can also serve as an RCU * read-side critical section. However, please note that this equivalence * applies only to v5.0 and later. Before v5.0, rcu_read_lock() and * rcu_read_lock_bh() were unrelated. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } /** * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ static inline void rcu_read_unlock_bh(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * * This is equivalent to rcu_read_lock(), but also disables preemption. * Read-side critical sections can also be introduced by anything else that * disables preemption, including local_irq_disable() and friends. However, * please note that the equivalence to rcu_read_lock() applies only to * v5.0 and later. Before v5.0, rcu_read_lock() and rcu_read_lock_sched() * were unrelated. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_sched() from process context if the matching * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_lock_sched_notrace(void) { preempt_disable_notrace(); __acquire(RCU_SCHED); } /** * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_unlock_sched_notrace(void) { __release(RCU_SCHED); preempt_enable_notrace(); } /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * * 1. This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to * readers either at compile time or via rcu_assign_pointer() *and* * * a. You have not made *any* reader-visible changes to * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to * other state where exact synchronization is not required.) * * Failure to follow these rules governing use of RCU_INIT_POINTER() will * result in impossible-to-diagnose memory corruption. As in the structures * will look OK in crash dumps, but any concurrent RCU readers might * see pre-initialized values of the referenced data structure. So * please be very careful how you use RCU_INIT_POINTER()!!! * * If you are creating an RCU-protected linked structure that is accessed * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ do { \ rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) /* * Does the specified offset indicate that the corresponding rcu_head * structure can be handled by kvfree_rcu()? */ #define __is_kvfree_rcu_offset(offset) ((offset) < 4096) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * * The kfree_rcu() function handles this issue. Rather than encoding a * function address in the embedded rcu_head structure, kfree_rcu() instead * encodes the offset of the rcu_head structure within the base structure. * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * * The object to be freed can be allocated either by kmalloc() or * kmem_cache_alloc(). * * Note that the allowable offset might decrease in the future. * * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ #define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) #define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf) /** * kfree_rcu_mightsleep() - kfree an object after a grace period. * @ptr: pointer to kfree for single-argument invocations. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * * kfree_rcu_mightsleep(ptr); * * where @ptr is the pointer to be freed by kvfree(). * * Please note, head-less way of freeing is permitted to * use from a context that has to follow might_sleep() * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) { \ BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ kvfree_call_rcu(NULL, (void *) (___p)); \ } while (0) /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ /* Has the specified rcu_head structure been handed to call_rcu()? */ /** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * * If you intend to invoke rcu_head_after_call_rcu() to test whether a * given rcu_head structure has already been passed to call_rcu(), then * you must also invoke this rcu_head_init() function on it just after * allocating that structure. Calls to this function must not race with * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. */ static inline void rcu_head_init(struct rcu_head *rhp) { rhp->func = (rcu_callback_t)~0L; } /** * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. Emits a warning in any other case, including * the case where @rhp has already been invoked after a grace period. * Calls to this function must not race with callback invocation. One way * to avoid such races is to enclose the call to rcu_head_after_call_rcu() * in an RCU read-side critical section that includes a read-side fetch * of the pointer to the structure containing @rhp. */ static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { rcu_callback_t func = READ_ONCE(rhp->func); if (func == f) return true; WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } /* kernel/ksysfs.c definitions */ extern int rcu_expedited; extern int rcu_normal; DEFINE_LOCK_GUARD_0(rcu, do { rcu_read_lock(); /* * sparse doesn't call the cleanup function, * so just release immediately and don't track * the context. We don't need to anyway, since * the whole point of the guard is to not need * the explicit unlock. */ __release(RCU); } while (0), rcu_read_unlock()) #endif /* __LINUX_RCUPDATE_H */
4 1 2 1 3 3 3 3 12 9 9 3 12 12 3 8 12 12 3 9 3 3 12 12 2 2 2 2 34 28 1 4 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2024 Alibaba Cloud */ #include "compress.h" #include <linux/lz4.h> #ifndef LZ4_DISTANCE_MAX /* history window size */ #define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ #endif #define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) #ifndef LZ4_DECOMPRESS_INPLACE_MARGIN #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) #endif struct z_erofs_lz4_decompress_ctx { struct z_erofs_decompress_req *rq; /* # of encoded, decoded pages */ unsigned int inpages, outpages; /* decoded block total length (used for in-place decompression) */ unsigned int oend; }; static int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, void *data, int size) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct z_erofs_lz4_cfgs *lz4 = data; u16 distance; if (lz4) { if (size < sizeof(struct z_erofs_lz4_cfgs)) { erofs_err(sb, "invalid lz4 cfgs, size=%u", size); return -EINVAL; } distance = le16_to_cpu(lz4->max_distance); sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks); if (!sbi->lz4.max_pclusterblks) { sbi->lz4.max_pclusterblks = 1; /* reserved case */ } else if (sbi->lz4.max_pclusterblks > erofs_blknr(sb, Z_EROFS_PCLUSTER_MAX_SIZE)) { erofs_err(sb, "too large lz4 pclusterblks %u", sbi->lz4.max_pclusterblks); return -EINVAL; } } else { distance = le16_to_cpu(dsb->u1.lz4_max_distance); sbi->lz4.max_pclusterblks = 1; } sbi->lz4.max_distance_pages = distance ? DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : LZ4_MAX_DISTANCE_PAGES; return z_erofs_gbuf_growsize(sbi->lz4.max_pclusterblks); } /* * Fill all gaps with bounce pages if it's a sparse page list. Also check if * all physical pages are consecutive, which can be seen for moderate CR. */ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, struct page **pagepool) { struct z_erofs_decompress_req *rq = ctx->rq; struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; unsigned int lz4_max_distance_pages = EROFS_SB(rq->sb)->lz4.max_distance_pages; void *kaddr = NULL; unsigned int i, j, top; top = 0; for (i = j = 0; i < ctx->outpages; ++i, ++j) { struct page *const page = rq->out[i]; struct page *victim; if (j >= lz4_max_distance_pages) j = 0; /* 'valid' bounced can only be tested after a complete round */ if (!rq->fillgaps && test_bit(j, bounced)) { DBG_BUGON(i < lz4_max_distance_pages); DBG_BUGON(top >= lz4_max_distance_pages); availables[top++] = rq->out[i - lz4_max_distance_pages]; } if (page) { __clear_bit(j, bounced); if (!PageHighMem(page)) { if (!i) { kaddr = page_address(page); continue; } if (kaddr && kaddr + PAGE_SIZE == page_address(page)) { kaddr += PAGE_SIZE; continue; } } kaddr = NULL; continue; } kaddr = NULL; __set_bit(j, bounced); if (top) { victim = availables[--top]; } else { victim = __erofs_allocpage(pagepool, rq->gfp, true); if (!victim) return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; } return kaddr ? 1 : 0; } static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, void *inpage, void *out, unsigned int *inputmargin, int *maptype, bool may_inplace) { struct z_erofs_decompress_req *rq = ctx->rq; unsigned int omargin, total, i; struct page **in; void *src, *tmp; if (rq->inplace_io) { omargin = PAGE_ALIGN(ctx->oend) - ctx->oend; if (rq->partial_decoding || !may_inplace || omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; for (i = 0; i < ctx->inpages; ++i) if (rq->out[ctx->outpages - ctx->inpages + i] != rq->in[i]) goto docopy; kunmap_local(inpage); *maptype = 3; return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); } if (ctx->inpages <= 1) { *maptype = 0; return inpage; } kunmap_local(inpage); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); *maptype = 1; return src; docopy: /* Or copy compressed data which can be overlapped to per-CPU buffer */ in = rq->in; src = z_erofs_get_gbuf(ctx->inpages); if (!src) { DBG_BUGON(1); kunmap_local(inpage); return ERR_PTR(-EFAULT); } tmp = src; total = rq->inputsize; while (total) { unsigned int page_copycnt = min_t(unsigned int, total, PAGE_SIZE - *inputmargin); if (!inpage) inpage = kmap_local_page(*in); memcpy(tmp, inpage + *inputmargin, page_copycnt); kunmap_local(inpage); inpage = NULL; tmp += page_copycnt; total -= page_copycnt; ++in; *inputmargin = 0; } *maptype = 2; return src; } /* * Get the exact inputsize with zero_padding feature. * - For LZ4, it should work if zero_padding feature is on (5.3+); * - For MicroLZMA, it'd be enabled all the time. */ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize) { const char *padend; padend = memchr_inv(padbuf, 0, padbufsize); if (!padend) return -EFSCORRUPTED; rq->inputsize -= padend - padbuf; rq->pageofs_in += padend - padbuf; return 0; } static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, u8 *dst) { struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; u8 *out, *headpage, *src; int ret, maptype; DBG_BUGON(*rq->in == NULL); headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { support_0padding = true; ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { kunmap_local(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & (rq->sb->s_blocksize - 1)); } inputmargin = rq->pageofs_in; src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster. */ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, rq->inputsize, rq->outputsize, rq->outputsize); else ret = LZ4_decompress_safe(src + inputmargin, out, rq->inputsize, rq->outputsize); if (ret != rq->outputsize) { erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", ret, rq->inputsize, inputmargin, rq->outputsize); if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); ret = -EFSCORRUPTED; } else { ret = 0; } if (maptype == 0) { kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { z_erofs_put_gbuf(src); } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; } return ret; } static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool) { struct z_erofs_lz4_decompress_ctx ctx; unsigned int dst_maptype; void *dst; int ret; ctx.rq = rq; ctx.oend = rq->pageofs_out + rq->outputsize; ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT; ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; /* one optimized fast path only for non bigpcluster cases yet */ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); dst = kmap_local_page(*rq->out); dst_maptype = 0; goto dstmap_out; } /* general decoding path which can be used for all cases */ ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool); if (ret < 0) { return ret; } else if (ret > 0) { dst = page_address(*rq->out); dst_maptype = 1; } else { dst = erofs_vm_map_ram(rq->out, ctx.outpages); if (!dst) return -ENOMEM; dst_maptype = 2; } dstmap_out: ret = z_erofs_lz4_decompress_mem(&ctx, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, ctx.outpages); return ret; } static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { const unsigned int nrpages_in = PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; const unsigned int bs = rq->sb->s_blocksize; unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; u8 *kin; if (rq->outputsize > rq->inputsize) return -EOPNOTSUPP; if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { cur = bs - (rq->pageofs_out & (bs - 1)); pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; cur = min(cur, rq->outputsize); if (cur && rq->out[0]) { kin = kmap_local_page(rq->in[nrpages_in - 1]); if (rq->out[0] == rq->in[nrpages_in - 1]) { memmove(kin + rq->pageofs_out, kin + pi, cur); flush_dcache_page(rq->out[0]); } else { memcpy_to_page(rq->out[0], rq->pageofs_out, kin + pi, cur); } kunmap_local(kin); } rq->outputsize -= cur; } for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); rq->outputsize -= insz; if (!rq->in[ni]) continue; kin = kmap_local_page(rq->in[ni]); pi = 0; do { no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; DBG_BUGON(no >= nrpages_out); cnt = min(insz - pi, PAGE_SIZE - po); if (rq->out[no] == rq->in[ni]) { memmove(kin + po, kin + rq->pageofs_in + pi, cnt); flush_dcache_page(rq->out[no]); } else if (rq->out[no]) { memcpy_to_page(rq->out[no], po, kin + rq->pageofs_in + pi, cnt); } pi += cnt; } while (pi < insz); kunmap_local(kin); } DBG_BUGON(ni > nrpages_in); return 0; } int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, void **src, struct page **pgpl) { struct z_erofs_decompress_req *rq = dctx->rq; struct super_block *sb = rq->sb; struct page **pgo, *tmppage; unsigned int j; if (!dctx->avail_out) { if (++dctx->no >= dctx->outpages || !rq->outputsize) { erofs_err(sb, "insufficient space for decompressed data"); return -EFSCORRUPTED; } if (dctx->kout) kunmap_local(dctx->kout); dctx->avail_out = min(rq->outputsize, PAGE_SIZE - rq->pageofs_out); rq->outputsize -= dctx->avail_out; pgo = &rq->out[dctx->no]; if (!*pgo && rq->fillgaps) { /* deduped */ *pgo = erofs_allocpage(pgpl, rq->gfp); if (!*pgo) { dctx->kout = NULL; return -ENOMEM; } set_page_private(*pgo, Z_EROFS_SHORTLIVED_PAGE); } if (*pgo) { dctx->kout = kmap_local_page(*pgo); *dst = dctx->kout + rq->pageofs_out; } else { *dst = dctx->kout = NULL; } rq->pageofs_out = 0; } if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) { if (++dctx->ni >= dctx->inpages) { erofs_err(sb, "invalid compressed data"); return -EFSCORRUPTED; } if (dctx->kout) /* unlike kmap(), take care of the orders */ kunmap_local(dctx->kout); kunmap_local(dctx->kin); dctx->inbuf_sz = min_t(u32, rq->inputsize, PAGE_SIZE); rq->inputsize -= dctx->inbuf_sz; dctx->kin = kmap_local_page(rq->in[dctx->ni]); *src = dctx->kin; dctx->bounced = false; if (dctx->kout) { j = (u8 *)*dst - dctx->kout; dctx->kout = kmap_local_page(rq->out[dctx->no]); *dst = dctx->kout + j; } dctx->inbuf_pos = 0; } /* * Handle overlapping: Use the given bounce buffer if the input data is * under processing; Or utilize short-lived pages from the on-stack page * pool, where pages are shared among the same request. Note that only * a few inplace I/O pages need to be doubled. */ if (!dctx->bounced && rq->out[dctx->no] == rq->in[dctx->ni]) { memcpy(dctx->bounce, *src, dctx->inbuf_sz); *src = dctx->bounce; dctx->bounced = true; } for (j = dctx->ni + 1; j < dctx->inpages; ++j) { if (rq->out[dctx->no] != rq->in[j]) continue; tmppage = erofs_allocpage(pgpl, rq->gfp); if (!tmppage) return -ENOMEM; set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); copy_highpage(tmppage, rq->in[j]); rq->in[j] = tmppage; } return 0; } const struct z_erofs_decompressor *z_erofs_decomp[] = { [Z_EROFS_COMPRESSION_SHIFTED] = &(const struct z_erofs_decompressor) { .decompress = z_erofs_transform_plain, .name = "shifted" }, [Z_EROFS_COMPRESSION_INTERLACED] = &(const struct z_erofs_decompressor) { .decompress = z_erofs_transform_plain, .name = "interlaced" }, [Z_EROFS_COMPRESSION_LZ4] = &(const struct z_erofs_decompressor) { .config = z_erofs_load_lz4_config, .decompress = z_erofs_lz4_decompress, .init = z_erofs_gbuf_init, .exit = z_erofs_gbuf_exit, .name = "lz4" }, #ifdef CONFIG_EROFS_FS_ZIP_LZMA [Z_EROFS_COMPRESSION_LZMA] = &z_erofs_lzma_decomp, #endif #ifdef CONFIG_EROFS_FS_ZIP_DEFLATE [Z_EROFS_COMPRESSION_DEFLATE] = &z_erofs_deflate_decomp, #endif #ifdef CONFIG_EROFS_FS_ZIP_ZSTD [Z_EROFS_COMPRESSION_ZSTD] = &z_erofs_zstd_decomp, #endif }; int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; unsigned int algs, alg; erofs_off_t offset; int size, ret = 0; if (!erofs_sb_has_compr_cfgs(sbi)) { sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4; return z_erofs_load_lz4_config(sb, dsb, NULL, 0); } sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { erofs_err(sb, "unidentified algorithms %x, please upgrade kernel", sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); return -EOPNOTSUPP; } erofs_init_metabuf(&buf, sb); offset = EROFS_SUPER_OFFSET + sbi->sb_size; alg = 0; for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { const struct z_erofs_decompressor *dec = z_erofs_decomp[alg]; void *data; if (!(algs & 1)) continue; data = erofs_read_metadata(sb, &buf, &offset, &size); if (IS_ERR(data)) { ret = PTR_ERR(data); break; } if (alg < Z_EROFS_COMPRESSION_MAX && dec && dec->config) { ret = dec->config(sb, dsb, data, size); } else { erofs_err(sb, "algorithm %d isn't enabled on this kernel", alg); ret = -EOPNOTSUPP; } kfree(data); if (ret) break; } erofs_put_metabuf(&buf); return ret; } int __init z_erofs_init_decompressor(void) { int i, err; for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) { err = z_erofs_decomp[i] ? z_erofs_decomp[i]->init() : 0; if (err) { while (i--) if (z_erofs_decomp[i]) z_erofs_decomp[i]->exit(); return err; } } return 0; } void z_erofs_exit_decompressor(void) { int i; for (i = 0; i < Z_EROFS_COMPRESSION_MAX; ++i) if (z_erofs_decomp[i]) z_erofs_decomp[i]->exit(); }
12 12 6 13 6 13 13 13 13 13 13 19 19 1 3 3 1 4 3 2 9 8 3 3 10 10 10 12 12 11 2 8 1 10 10 10 28 2 1 13 5 6 1 12 7 5 12 10 2 11 11 10 10 1 10 10 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> #include <linux/can/skb.h> #include "j1939-priv.h" #define J1939_XTP_TX_RETRY_LIMIT 100 #define J1939_ETP_PGN_CTL 0xc800 #define J1939_ETP_PGN_DAT 0xc700 #define J1939_TP_PGN_CTL 0xec00 #define J1939_TP_PGN_DAT 0xeb00 #define J1939_TP_CMD_RTS 0x10 #define J1939_TP_CMD_CTS 0x11 #define J1939_TP_CMD_EOMA 0x13 #define J1939_TP_CMD_BAM 0x20 #define J1939_TP_CMD_ABORT 0xff #define J1939_ETP_CMD_RTS 0x14 #define J1939_ETP_CMD_CTS 0x15 #define J1939_ETP_CMD_DPO 0x16 #define J1939_ETP_CMD_EOMA 0x17 #define J1939_ETP_CMD_ABORT 0xff enum j1939_xtp_abort { J1939_XTP_NO_ABORT = 0, J1939_XTP_ABORT_BUSY = 1, /* Already in one or more connection managed sessions and * cannot support another. * * EALREADY: * Operation already in progress */ J1939_XTP_ABORT_RESOURCE = 2, /* System resources were needed for another task so this * connection managed session was terminated. * * EMSGSIZE: * The socket type requires that message be sent atomically, * and the size of the message to be sent made this * impossible. */ J1939_XTP_ABORT_TIMEOUT = 3, /* A timeout occurred and this is the connection abort to * close the session. * * EHOSTUNREACH: * The destination host cannot be reached (probably because * the host is down or a remote router cannot reach it). */ J1939_XTP_ABORT_GENERIC = 4, /* CTS messages received when data transfer is in progress * * EBADMSG: * Not a data message */ J1939_XTP_ABORT_FAULT = 5, /* Maximal retransmit request limit reached * * ENOTRECOVERABLE: * State not recoverable */ J1939_XTP_ABORT_UNEXPECTED_DATA = 6, /* Unexpected data transfer packet * * ENOTCONN: * Transport endpoint is not connected */ J1939_XTP_ABORT_BAD_SEQ = 7, /* Bad sequence number (and software is not able to recover) * * EILSEQ: * Illegal byte sequence */ J1939_XTP_ABORT_DUP_SEQ = 8, /* Duplicate sequence number (and software is not able to * recover) */ J1939_XTP_ABORT_EDPO_UNEXPECTED = 9, /* Unexpected EDPO packet (ETP) or Message size > 1785 bytes * (TP) */ J1939_XTP_ABORT_BAD_EDPO_PGN = 10, /* Unexpected EDPO PGN (PGN in EDPO is bad) */ J1939_XTP_ABORT_EDPO_OUTOF_CTS = 11, /* EDPO number of packets is greater than CTS */ J1939_XTP_ABORT_BAD_EDPO_OFFSET = 12, /* Bad EDPO offset */ J1939_XTP_ABORT_OTHER_DEPRECATED = 13, /* Deprecated. Use 250 instead (Any other reason) */ J1939_XTP_ABORT_ECTS_UNXPECTED_PGN = 14, /* Unexpected ECTS PGN (PGN in ECTS is bad) */ J1939_XTP_ABORT_ECTS_TOO_BIG = 15, /* ECTS requested packets exceeds message size */ J1939_XTP_ABORT_OTHER = 250, /* Any other reason (if a Connection Abort reason is * identified that is not listed in the table use code 250) */ }; static unsigned int j1939_tp_block = 255; static unsigned int j1939_tp_packet_delay; static unsigned int j1939_tp_padding = 1; /* helpers */ static const char *j1939_xtp_abort_to_str(enum j1939_xtp_abort abort) { switch (abort) { case J1939_XTP_ABORT_BUSY: return "Already in one or more connection managed sessions and cannot support another."; case J1939_XTP_ABORT_RESOURCE: return "System resources were needed for another task so this connection managed session was terminated."; case J1939_XTP_ABORT_TIMEOUT: return "A timeout occurred and this is the connection abort to close the session."; case J1939_XTP_ABORT_GENERIC: return "CTS messages received when data transfer is in progress"; case J1939_XTP_ABORT_FAULT: return "Maximal retransmit request limit reached"; case J1939_XTP_ABORT_UNEXPECTED_DATA: return "Unexpected data transfer packet"; case J1939_XTP_ABORT_BAD_SEQ: return "Bad sequence number (and software is not able to recover)"; case J1939_XTP_ABORT_DUP_SEQ: return "Duplicate sequence number (and software is not able to recover)"; case J1939_XTP_ABORT_EDPO_UNEXPECTED: return "Unexpected EDPO packet (ETP) or Message size > 1785 bytes (TP)"; case J1939_XTP_ABORT_BAD_EDPO_PGN: return "Unexpected EDPO PGN (PGN in EDPO is bad)"; case J1939_XTP_ABORT_EDPO_OUTOF_CTS: return "EDPO number of packets is greater than CTS"; case J1939_XTP_ABORT_BAD_EDPO_OFFSET: return "Bad EDPO offset"; case J1939_XTP_ABORT_OTHER_DEPRECATED: return "Deprecated. Use 250 instead (Any other reason)"; case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN: return "Unexpected ECTS PGN (PGN in ECTS is bad)"; case J1939_XTP_ABORT_ECTS_TOO_BIG: return "ECTS requested packets exceeds message size"; case J1939_XTP_ABORT_OTHER: return "Any other reason (if a Connection Abort reason is identified that is not listed in the table use code 250)"; default: return "<unknown>"; } } static int j1939_xtp_abort_to_errno(struct j1939_priv *priv, enum j1939_xtp_abort abort) { int err; switch (abort) { case J1939_XTP_NO_ABORT: WARN_ON_ONCE(abort == J1939_XTP_NO_ABORT); err = 0; break; case J1939_XTP_ABORT_BUSY: err = EALREADY; break; case J1939_XTP_ABORT_RESOURCE: err = EMSGSIZE; break; case J1939_XTP_ABORT_TIMEOUT: err = EHOSTUNREACH; break; case J1939_XTP_ABORT_GENERIC: err = EBADMSG; break; case J1939_XTP_ABORT_FAULT: err = ENOTRECOVERABLE; break; case J1939_XTP_ABORT_UNEXPECTED_DATA: err = ENOTCONN; break; case J1939_XTP_ABORT_BAD_SEQ: err = EILSEQ; break; case J1939_XTP_ABORT_DUP_SEQ: err = EPROTO; break; case J1939_XTP_ABORT_EDPO_UNEXPECTED: err = EPROTO; break; case J1939_XTP_ABORT_BAD_EDPO_PGN: err = EPROTO; break; case J1939_XTP_ABORT_EDPO_OUTOF_CTS: err = EPROTO; break; case J1939_XTP_ABORT_BAD_EDPO_OFFSET: err = EPROTO; break; case J1939_XTP_ABORT_OTHER_DEPRECATED: err = EPROTO; break; case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN: err = EPROTO; break; case J1939_XTP_ABORT_ECTS_TOO_BIG: err = EPROTO; break; case J1939_XTP_ABORT_OTHER: err = EPROTO; break; default: netdev_warn(priv->ndev, "Unknown abort code %i", abort); err = EPROTO; } return err; } static inline void j1939_session_list_lock(struct j1939_priv *priv) { spin_lock_bh(&priv->active_session_list_lock); } static inline void j1939_session_list_unlock(struct j1939_priv *priv) { spin_unlock_bh(&priv->active_session_list_lock); } void j1939_session_get(struct j1939_session *session) { kref_get(&session->kref); } /* session completion functions */ static void __j1939_session_drop(struct j1939_session *session) { if (!session->transmission) return; j1939_sock_pending_del(session->sk); sock_put(session->sk); } static void j1939_session_destroy(struct j1939_session *session) { struct sk_buff *skb; if (session->transmission) { if (session->err) j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ABORT); else j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ACK); } else if (session->err) { j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); } netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry)); WARN_ON_ONCE(!list_empty(&session->active_session_list_entry)); while ((skb = skb_dequeue(&session->skb_queue)) != NULL) { /* drop ref taken in j1939_session_skb_queue() */ skb_unref(skb); kfree_skb(skb); } __j1939_session_drop(session); j1939_priv_put(session->priv); kfree(session); } static void __j1939_session_release(struct kref *kref) { struct j1939_session *session = container_of(kref, struct j1939_session, kref); j1939_session_destroy(session); } void j1939_session_put(struct j1939_session *session) { kref_put(&session->kref, __j1939_session_release); } static void j1939_session_txtimer_cancel(struct j1939_session *session) { if (hrtimer_cancel(&session->txtimer)) j1939_session_put(session); } static void j1939_session_rxtimer_cancel(struct j1939_session *session) { if (hrtimer_cancel(&session->rxtimer)) j1939_session_put(session); } void j1939_session_timers_cancel(struct j1939_session *session) { j1939_session_txtimer_cancel(session); j1939_session_rxtimer_cancel(session); } static inline bool j1939_cb_is_broadcast(const struct j1939_sk_buff_cb *skcb) { return (!skcb->addr.dst_name && (skcb->addr.da == 0xff)); } static void j1939_session_skb_drop_old(struct j1939_session *session) { struct sk_buff *do_skb; struct j1939_sk_buff_cb *do_skcb; unsigned int offset_start; unsigned long flags; if (skb_queue_len(&session->skb_queue) < 2) return; offset_start = session->pkt.tx_acked * 7; spin_lock_irqsave(&session->skb_queue.lock, flags); do_skb = skb_peek(&session->skb_queue); do_skcb = j1939_skb_to_cb(do_skb); if ((do_skcb->offset + do_skb->len) < offset_start) { __skb_unlink(do_skb, &session->skb_queue); /* drop ref taken in j1939_session_skb_queue() */ skb_unref(do_skb); spin_unlock_irqrestore(&session->skb_queue.lock, flags); kfree_skb(do_skb); } else { spin_unlock_irqrestore(&session->skb_queue.lock, flags); } } void j1939_session_skb_queue(struct j1939_session *session, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_priv *priv = session->priv; j1939_ac_fixup(priv, skb); if (j1939_address_is_unicast(skcb->addr.da) && priv->ents[skcb->addr.da].nusers) skcb->flags |= J1939_ECU_LOCAL_DST; skcb->flags |= J1939_ECU_LOCAL_SRC; skb_get(skb); skb_queue_tail(&session->skb_queue, skb); } static struct sk_buff *j1939_session_skb_get_by_offset(struct j1939_session *session, unsigned int offset_start) { struct j1939_priv *priv = session->priv; struct j1939_sk_buff_cb *do_skcb; struct sk_buff *skb = NULL; struct sk_buff *do_skb; unsigned long flags; spin_lock_irqsave(&session->skb_queue.lock, flags); skb_queue_walk(&session->skb_queue, do_skb) { do_skcb = j1939_skb_to_cb(do_skb); if (offset_start >= do_skcb->offset && offset_start < (do_skcb->offset + do_skb->len)) { skb = do_skb; } } if (skb) skb_get(skb); spin_unlock_irqrestore(&session->skb_queue.lock, flags); if (!skb) netdev_dbg(priv->ndev, "%s: 0x%p: no skb found for start: %i, queue size: %i\n", __func__, session, offset_start, skb_queue_len(&session->skb_queue)); return skb; } static struct sk_buff *j1939_session_skb_get(struct j1939_session *session) { unsigned int offset_start; offset_start = session->pkt.dpo * 7; return j1939_session_skb_get_by_offset(session, offset_start); } /* see if we are receiver * returns 0 for broadcasts, although we will receive them */ static inline int j1939_tp_im_receiver(const struct j1939_sk_buff_cb *skcb) { return skcb->flags & J1939_ECU_LOCAL_DST; } /* see if we are sender */ static inline int j1939_tp_im_transmitter(const struct j1939_sk_buff_cb *skcb) { return skcb->flags & J1939_ECU_LOCAL_SRC; } /* see if we are involved as either receiver or transmitter */ static int j1939_tp_im_involved(const struct j1939_sk_buff_cb *skcb, bool swap) { if (swap) return j1939_tp_im_receiver(skcb); else return j1939_tp_im_transmitter(skcb); } static int j1939_tp_im_involved_anydir(struct j1939_sk_buff_cb *skcb) { return skcb->flags & (J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST); } /* extract pgn from flow-ctl message */ static inline pgn_t j1939_xtp_ctl_to_pgn(const u8 *dat) { pgn_t pgn; pgn = (dat[7] << 16) | (dat[6] << 8) | (dat[5] << 0); if (j1939_pgn_is_pdu1(pgn)) pgn &= 0xffff00; return pgn; } static inline unsigned int j1939_tp_ctl_to_size(const u8 *dat) { return (dat[2] << 8) + (dat[1] << 0); } static inline unsigned int j1939_etp_ctl_to_packet(const u8 *dat) { return (dat[4] << 16) | (dat[3] << 8) | (dat[2] << 0); } static inline unsigned int j1939_etp_ctl_to_size(const u8 *dat) { return (dat[4] << 24) | (dat[3] << 16) | (dat[2] << 8) | (dat[1] << 0); } /* find existing session: * reverse: swap cb's src & dst * there is no problem with matching broadcasts, since * broadcasts (no dst, no da) would never call this * with reverse == true */ static bool j1939_session_match(struct j1939_addr *se_addr, struct j1939_addr *sk_addr, bool reverse) { if (se_addr->type != sk_addr->type) return false; if (reverse) { if (se_addr->src_name) { if (se_addr->src_name != sk_addr->dst_name) return false; } else if (se_addr->sa != sk_addr->da) { return false; } if (se_addr->dst_name) { if (se_addr->dst_name != sk_addr->src_name) return false; } else if (se_addr->da != sk_addr->sa) { return false; } } else { if (se_addr->src_name) { if (se_addr->src_name != sk_addr->src_name) return false; } else if (se_addr->sa != sk_addr->sa) { return false; } if (se_addr->dst_name) { if (se_addr->dst_name != sk_addr->dst_name) return false; } else if (se_addr->da != sk_addr->da) { return false; } } return true; } static struct j1939_session *j1939_session_get_by_addr_locked(struct j1939_priv *priv, struct list_head *root, struct j1939_addr *addr, bool reverse, bool transmitter) { struct j1939_session *session; lockdep_assert_held(&priv->active_session_list_lock); list_for_each_entry(session, root, active_session_list_entry) { j1939_session_get(session); if (j1939_session_match(&session->skcb.addr, addr, reverse) && session->transmission == transmitter) return session; j1939_session_put(session); } return NULL; } static struct j1939_session *j1939_session_get_simple(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_session *session; lockdep_assert_held(&priv->active_session_list_lock); list_for_each_entry(session, &priv->active_session_list, active_session_list_entry) { j1939_session_get(session); if (session->skcb.addr.type == J1939_SIMPLE && session->tskey == skcb->tskey && session->sk == skb->sk) return session; j1939_session_put(session); } return NULL; } static struct j1939_session *j1939_session_get_by_addr(struct j1939_priv *priv, struct j1939_addr *addr, bool reverse, bool transmitter) { struct j1939_session *session; j1939_session_list_lock(priv); session = j1939_session_get_by_addr_locked(priv, &priv->active_session_list, addr, reverse, transmitter); j1939_session_list_unlock(priv); return session; } static void j1939_skbcb_swap(struct j1939_sk_buff_cb *skcb) { u8 tmp = 0; swap(skcb->addr.dst_name, skcb->addr.src_name); swap(skcb->addr.da, skcb->addr.sa); /* swap SRC and DST flags, leave other untouched */ if (skcb->flags & J1939_ECU_LOCAL_SRC) tmp |= J1939_ECU_LOCAL_DST; if (skcb->flags & J1939_ECU_LOCAL_DST) tmp |= J1939_ECU_LOCAL_SRC; skcb->flags &= ~(J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST); skcb->flags |= tmp; } static struct sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, const struct j1939_sk_buff_cb *re_skcb, bool ctl, bool swap_src_dst) { struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; skb = alloc_skb(sizeof(struct can_frame) + sizeof(struct can_skb_priv), GFP_ATOMIC); if (unlikely(!skb)) return ERR_PTR(-ENOMEM); skb->dev = priv->ndev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = priv->ndev->ifindex; can_skb_prv(skb)->skbcnt = 0; /* reserve CAN header */ skb_reserve(skb, offsetof(struct can_frame, data)); /* skb->cb must be large enough to hold a j1939_sk_buff_cb structure */ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*re_skcb)); memcpy(skb->cb, re_skcb, sizeof(*re_skcb)); skcb = j1939_skb_to_cb(skb); if (swap_src_dst) j1939_skbcb_swap(skcb); if (ctl) { if (skcb->addr.type == J1939_ETP) skcb->addr.pgn = J1939_ETP_PGN_CTL; else skcb->addr.pgn = J1939_TP_PGN_CTL; } else { if (skcb->addr.type == J1939_ETP) skcb->addr.pgn = J1939_ETP_PGN_DAT; else skcb->addr.pgn = J1939_TP_PGN_DAT; } return skb; } /* TP transmit packet functions */ static int j1939_tp_tx_dat(struct j1939_session *session, const u8 *dat, int len) { struct j1939_priv *priv = session->priv; struct sk_buff *skb; skb = j1939_tp_tx_dat_new(priv, &session->skcb, false, false); if (IS_ERR(skb)) return PTR_ERR(skb); skb_put_data(skb, dat, len); if (j1939_tp_padding && len < 8) memset(skb_put(skb, 8 - len), 0xff, 8 - len); return j1939_send_one(priv, skb); } static int j1939_xtp_do_tx_ctl(struct j1939_priv *priv, const struct j1939_sk_buff_cb *re_skcb, bool swap_src_dst, pgn_t pgn, const u8 *dat) { struct sk_buff *skb; u8 *skdat; if (!j1939_tp_im_involved(re_skcb, swap_src_dst)) return 0; skb = j1939_tp_tx_dat_new(priv, re_skcb, true, swap_src_dst); if (IS_ERR(skb)) return PTR_ERR(skb); skdat = skb_put(skb, 8); memcpy(skdat, dat, 5); skdat[5] = (pgn >> 0); skdat[6] = (pgn >> 8); skdat[7] = (pgn >> 16); return j1939_send_one(priv, skb); } static inline int j1939_tp_tx_ctl(struct j1939_session *session, bool swap_src_dst, const u8 *dat) { struct j1939_priv *priv = session->priv; return j1939_xtp_do_tx_ctl(priv, &session->skcb, swap_src_dst, session->skcb.addr.pgn, dat); } static int j1939_xtp_tx_abort(struct j1939_priv *priv, const struct j1939_sk_buff_cb *re_skcb, bool swap_src_dst, enum j1939_xtp_abort err, pgn_t pgn) { u8 dat[5]; if (!j1939_tp_im_involved(re_skcb, swap_src_dst)) return 0; memset(dat, 0xff, sizeof(dat)); dat[0] = J1939_TP_CMD_ABORT; dat[1] = err; return j1939_xtp_do_tx_ctl(priv, re_skcb, swap_src_dst, pgn, dat); } void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec) { j1939_session_get(session); hrtimer_start(&session->txtimer, ms_to_ktime(msec), HRTIMER_MODE_REL_SOFT); } static inline void j1939_tp_set_rxtimeout(struct j1939_session *session, int msec) { j1939_session_rxtimer_cancel(session); j1939_session_get(session); hrtimer_start(&session->rxtimer, ms_to_ktime(msec), HRTIMER_MODE_REL_SOFT); } static int j1939_session_tx_rts(struct j1939_session *session) { u8 dat[8]; int ret; memset(dat, 0xff, sizeof(dat)); dat[1] = (session->total_message_size >> 0); dat[2] = (session->total_message_size >> 8); dat[3] = session->pkt.total; if (session->skcb.addr.type == J1939_ETP) { dat[0] = J1939_ETP_CMD_RTS; dat[1] = (session->total_message_size >> 0); dat[2] = (session->total_message_size >> 8); dat[3] = (session->total_message_size >> 16); dat[4] = (session->total_message_size >> 24); } else if (j1939_cb_is_broadcast(&session->skcb)) { dat[0] = J1939_TP_CMD_BAM; /* fake cts for broadcast */ session->pkt.tx = 0; } else { dat[0] = J1939_TP_CMD_RTS; dat[4] = dat[3]; } if (dat[0] == session->last_txcmd) /* done already */ return 0; ret = j1939_tp_tx_ctl(session, false, dat); if (ret < 0) return ret; session->last_txcmd = dat[0]; if (dat[0] == J1939_TP_CMD_BAM) { j1939_tp_schedule_txtimer(session, 50); j1939_tp_set_rxtimeout(session, 250); } else { j1939_tp_set_rxtimeout(session, 1250); } netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); return 0; } static int j1939_session_tx_dpo(struct j1939_session *session) { unsigned int pkt; u8 dat[8]; int ret; memset(dat, 0xff, sizeof(dat)); dat[0] = J1939_ETP_CMD_DPO; session->pkt.dpo = session->pkt.tx_acked; pkt = session->pkt.dpo; dat[1] = session->pkt.last - session->pkt.tx_acked; dat[2] = (pkt >> 0); dat[3] = (pkt >> 8); dat[4] = (pkt >> 16); ret = j1939_tp_tx_ctl(session, false, dat); if (ret < 0) return ret; session->last_txcmd = dat[0]; j1939_tp_set_rxtimeout(session, 1250); session->pkt.tx = session->pkt.tx_acked; netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); return 0; } static int j1939_session_tx_dat(struct j1939_session *session) { struct j1939_priv *priv = session->priv; struct j1939_sk_buff_cb *se_skcb; int offset, pkt_done, pkt_end; unsigned int len, pdelay; struct sk_buff *se_skb; const u8 *tpdat; int ret = 0; u8 dat[8]; se_skb = j1939_session_skb_get_by_offset(session, session->pkt.tx * 7); if (!se_skb) return -ENOBUFS; se_skcb = j1939_skb_to_cb(se_skb); tpdat = se_skb->data; ret = 0; pkt_done = 0; if (session->skcb.addr.type != J1939_ETP && j1939_cb_is_broadcast(&session->skcb)) pkt_end = session->pkt.total; else pkt_end = session->pkt.last; while (session->pkt.tx < pkt_end) { dat[0] = session->pkt.tx - session->pkt.dpo + 1; offset = (session->pkt.tx * 7) - se_skcb->offset; len = se_skb->len - offset; if (len > 7) len = 7; if (offset + len > se_skb->len) { netdev_err_once(priv->ndev, "%s: 0x%p: requested data outside of queued buffer: offset %i, len %i, pkt.tx: %i\n", __func__, session, se_skcb->offset, se_skb->len , session->pkt.tx); ret = -EOVERFLOW; goto out_free; } if (!len) { ret = -ENOBUFS; break; } memcpy(&dat[1], &tpdat[offset], len); ret = j1939_tp_tx_dat(session, dat, len + 1); if (ret < 0) { /* ENOBUFS == CAN interface TX queue is full */ if (ret != -ENOBUFS) netdev_alert(priv->ndev, "%s: 0x%p: queue data error: %i\n", __func__, session, ret); break; } session->last_txcmd = 0xff; pkt_done++; session->pkt.tx++; pdelay = j1939_cb_is_broadcast(&session->skcb) ? 50 : j1939_tp_packet_delay; if (session->pkt.tx < session->pkt.total && pdelay) { j1939_tp_schedule_txtimer(session, pdelay); break; } } if (pkt_done) j1939_tp_set_rxtimeout(session, 250); out_free: if (ret) kfree_skb(se_skb); else consume_skb(se_skb); return ret; } static int j1939_xtp_txnext_transmiter(struct j1939_session *session) { struct j1939_priv *priv = session->priv; int ret = 0; if (!j1939_tp_im_transmitter(&session->skcb)) { netdev_alert(priv->ndev, "%s: 0x%p: called by not transmitter!\n", __func__, session); return -EINVAL; } switch (session->last_cmd) { case 0: ret = j1939_session_tx_rts(session); break; case J1939_ETP_CMD_CTS: if (session->last_txcmd != J1939_ETP_CMD_DPO) { ret = j1939_session_tx_dpo(session); if (ret) return ret; } fallthrough; case J1939_TP_CMD_CTS: case 0xff: /* did some data */ case J1939_ETP_CMD_DPO: case J1939_TP_CMD_BAM: ret = j1939_session_tx_dat(session); break; default: netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n", __func__, session, session->last_cmd); } return ret; } static int j1939_session_tx_cts(struct j1939_session *session) { struct j1939_priv *priv = session->priv; unsigned int pkt, len; int ret; u8 dat[8]; if (!j1939_sk_recv_match(priv, &session->skcb)) return -ENOENT; len = session->pkt.total - session->pkt.rx; len = min3(len, session->pkt.block, j1939_tp_block ?: 255); memset(dat, 0xff, sizeof(dat)); if (session->skcb.addr.type == J1939_ETP) { pkt = session->pkt.rx + 1; dat[0] = J1939_ETP_CMD_CTS; dat[1] = len; dat[2] = (pkt >> 0); dat[3] = (pkt >> 8); dat[4] = (pkt >> 16); } else { dat[0] = J1939_TP_CMD_CTS; dat[1] = len; dat[2] = session->pkt.rx + 1; } if (dat[0] == session->last_txcmd) /* done already */ return 0; ret = j1939_tp_tx_ctl(session, true, dat); if (ret < 0) return ret; if (len) /* only mark cts done when len is set */ session->last_txcmd = dat[0]; j1939_tp_set_rxtimeout(session, 1250); netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); return 0; } static int j1939_session_tx_eoma(struct j1939_session *session) { struct j1939_priv *priv = session->priv; u8 dat[8]; int ret; if (!j1939_sk_recv_match(priv, &session->skcb)) return -ENOENT; memset(dat, 0xff, sizeof(dat)); if (session->skcb.addr.type == J1939_ETP) { dat[0] = J1939_ETP_CMD_EOMA; dat[1] = session->total_message_size >> 0; dat[2] = session->total_message_size >> 8; dat[3] = session->total_message_size >> 16; dat[4] = session->total_message_size >> 24; } else { dat[0] = J1939_TP_CMD_EOMA; dat[1] = session->total_message_size; dat[2] = session->total_message_size >> 8; dat[3] = session->pkt.total; } if (dat[0] == session->last_txcmd) /* done already */ return 0; ret = j1939_tp_tx_ctl(session, true, dat); if (ret < 0) return ret; session->last_txcmd = dat[0]; /* wait for the EOMA packet to come in */ j1939_tp_set_rxtimeout(session, 1250); netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); return 0; } static int j1939_xtp_txnext_receiver(struct j1939_session *session) { struct j1939_priv *priv = session->priv; int ret = 0; if (!j1939_tp_im_receiver(&session->skcb)) { netdev_alert(priv->ndev, "%s: 0x%p: called by not receiver!\n", __func__, session); return -EINVAL; } switch (session->last_cmd) { case J1939_TP_CMD_RTS: case J1939_ETP_CMD_RTS: ret = j1939_session_tx_cts(session); break; case J1939_ETP_CMD_CTS: case J1939_TP_CMD_CTS: case 0xff: /* did some data */ case J1939_ETP_CMD_DPO: if ((session->skcb.addr.type == J1939_TP && j1939_cb_is_broadcast(&session->skcb))) break; if (session->pkt.rx >= session->pkt.total) { ret = j1939_session_tx_eoma(session); } else if (session->pkt.rx >= session->pkt.last) { session->last_txcmd = 0; ret = j1939_session_tx_cts(session); } break; default: netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n", __func__, session, session->last_cmd); } return ret; } static int j1939_simple_txnext(struct j1939_session *session) { struct j1939_priv *priv = session->priv; struct sk_buff *se_skb = j1939_session_skb_get(session); struct sk_buff *skb; int ret; if (!se_skb) return 0; skb = skb_clone(se_skb, GFP_ATOMIC); if (!skb) { ret = -ENOMEM; goto out_free; } can_skb_set_owner(skb, se_skb->sk); j1939_tp_set_rxtimeout(session, J1939_SIMPLE_ECHO_TIMEOUT_MS); ret = j1939_send_one(priv, skb); if (ret) goto out_free; j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_SCHED); j1939_sk_queue_activate_next(session); out_free: if (ret) kfree_skb(se_skb); else consume_skb(se_skb); return ret; } static bool j1939_session_deactivate_locked(struct j1939_session *session) { bool active = false; lockdep_assert_held(&session->priv->active_session_list_lock); if (session->state >= J1939_SESSION_ACTIVE && session->state < J1939_SESSION_ACTIVE_MAX) { active = true; list_del_init(&session->active_session_list_entry); session->state = J1939_SESSION_DONE; j1939_session_put(session); } return active; } static bool j1939_session_deactivate(struct j1939_session *session) { struct j1939_priv *priv = session->priv; bool active; j1939_session_list_lock(priv); active = j1939_session_deactivate_locked(session); j1939_session_list_unlock(priv); return active; } static void j1939_session_deactivate_activate_next(struct j1939_session *session) { if (j1939_session_deactivate(session)) j1939_sk_queue_activate_next(session); } static void __j1939_session_cancel(struct j1939_session *session, enum j1939_xtp_abort err) { struct j1939_priv *priv = session->priv; WARN_ON_ONCE(!err); lockdep_assert_held(&session->priv->active_session_list_lock); session->err = j1939_xtp_abort_to_errno(priv, err); session->state = J1939_SESSION_WAITING_ABORT; /* do not send aborts on incoming broadcasts */ if (!j1939_cb_is_broadcast(&session->skcb)) { j1939_xtp_tx_abort(priv, &session->skcb, !session->transmission, err, session->skcb.addr.pgn); } if (session->sk) j1939_sk_send_loop_abort(session->sk, session->err); } static void j1939_session_cancel(struct j1939_session *session, enum j1939_xtp_abort err) { j1939_session_list_lock(session->priv); if (session->state >= J1939_SESSION_ACTIVE && session->state < J1939_SESSION_WAITING_ABORT) { j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS); __j1939_session_cancel(session, err); } j1939_session_list_unlock(session->priv); if (!session->sk) j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); } static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) { struct j1939_session *session = container_of(hrtimer, struct j1939_session, txtimer); struct j1939_priv *priv = session->priv; int ret = 0; if (session->skcb.addr.type == J1939_SIMPLE) { ret = j1939_simple_txnext(session); } else { if (session->transmission) ret = j1939_xtp_txnext_transmiter(session); else ret = j1939_xtp_txnext_receiver(session); } switch (ret) { case -ENOBUFS: /* Retry limit is currently arbitrary chosen */ if (session->tx_retry < J1939_XTP_TX_RETRY_LIMIT) { session->tx_retry++; j1939_tp_schedule_txtimer(session, 10 + get_random_u32_below(16)); } else { netdev_alert(priv->ndev, "%s: 0x%p: tx retry count reached\n", __func__, session); session->err = -ENETUNREACH; j1939_session_rxtimer_cancel(session); j1939_session_deactivate_activate_next(session); } break; case -ENETDOWN: /* In this case we should get a netdev_event(), all active * sessions will be cleared by j1939_cancel_active_session(). * So handle this as an error, but let * j1939_cancel_active_session() do the cleanup including * propagation of the error to user space. */ break; case -EOVERFLOW: j1939_session_cancel(session, J1939_XTP_ABORT_ECTS_TOO_BIG); break; case 0: session->tx_retry = 0; break; default: netdev_alert(priv->ndev, "%s: 0x%p: tx aborted with unknown reason: %i\n", __func__, session, ret); if (session->skcb.addr.type != J1939_SIMPLE) { j1939_session_cancel(session, J1939_XTP_ABORT_OTHER); } else { session->err = ret; j1939_session_rxtimer_cancel(session); j1939_session_deactivate_activate_next(session); } } j1939_session_put(session); return HRTIMER_NORESTART; } static void j1939_session_completed(struct j1939_session *session) { struct sk_buff *se_skb; if (!session->transmission) { se_skb = j1939_session_skb_get(session); /* distribute among j1939 receivers */ j1939_sk_recv(session->priv, se_skb); consume_skb(se_skb); } j1939_session_deactivate_activate_next(session); } static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) { struct j1939_session *session = container_of(hrtimer, struct j1939_session, rxtimer); struct j1939_priv *priv = session->priv; if (session->state == J1939_SESSION_WAITING_ABORT) { netdev_alert(priv->ndev, "%s: 0x%p: abort rx timeout. Force session deactivation\n", __func__, session); j1939_session_deactivate_activate_next(session); } else if (session->skcb.addr.type == J1939_SIMPLE) { netdev_alert(priv->ndev, "%s: 0x%p: Timeout. Failed to send simple message.\n", __func__, session); /* The message is probably stuck in the CAN controller and can * be send as soon as CAN bus is in working state again. */ session->err = -ETIME; j1939_session_deactivate(session); } else { j1939_session_list_lock(session->priv); if (session->state >= J1939_SESSION_ACTIVE && session->state < J1939_SESSION_ACTIVE_MAX) { netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", __func__, session); j1939_session_get(session); hrtimer_start(&session->rxtimer, ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS), HRTIMER_MODE_REL_SOFT); __j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT); } j1939_session_list_unlock(session->priv); if (!session->sk) j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); } j1939_session_put(session); return HRTIMER_NORESTART; } static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session, const struct sk_buff *skb) { const struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); pgn_t pgn = j1939_xtp_ctl_to_pgn(skb->data); struct j1939_priv *priv = session->priv; enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT; u8 cmd = skb->data[0]; if (session->skcb.addr.pgn == pgn) return false; switch (cmd) { case J1939_TP_CMD_BAM: abort = J1939_XTP_NO_ABORT; break; case J1939_ETP_CMD_RTS: fallthrough; case J1939_TP_CMD_RTS: abort = J1939_XTP_ABORT_BUSY; break; case J1939_ETP_CMD_CTS: fallthrough; case J1939_TP_CMD_CTS: abort = J1939_XTP_ABORT_ECTS_UNXPECTED_PGN; break; case J1939_ETP_CMD_DPO: abort = J1939_XTP_ABORT_BAD_EDPO_PGN; break; case J1939_ETP_CMD_EOMA: fallthrough; case J1939_TP_CMD_EOMA: abort = J1939_XTP_ABORT_OTHER; break; case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ abort = J1939_XTP_NO_ABORT; break; default: WARN_ON_ONCE(1); break; } netdev_warn(priv->ndev, "%s: 0x%p: CMD 0x%02x with PGN 0x%05x for running session with different PGN 0x%05x.\n", __func__, session, cmd, pgn, session->skcb.addr.pgn); if (abort != J1939_XTP_NO_ABORT) j1939_xtp_tx_abort(priv, skcb, true, abort, pgn); return true; } static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb, bool reverse, bool transmitter) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_session *session; u8 abort = skb->data[1]; session = j1939_session_get_by_addr(priv, &skcb->addr, reverse, transmitter); if (!session) return; if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) goto abort_put; netdev_info(priv->ndev, "%s: 0x%p: 0x%05x: (%u) %s\n", __func__, session, j1939_xtp_ctl_to_pgn(skb->data), abort, j1939_xtp_abort_to_str(abort)); j1939_session_timers_cancel(session); session->err = j1939_xtp_abort_to_errno(priv, abort); if (session->sk) j1939_sk_send_loop_abort(session->sk, session->err); else j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); j1939_session_deactivate_activate_next(session); abort_put: j1939_session_put(session); } /* abort packets may come in 2 directions */ static void j1939_xtp_rx_abort(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter) { j1939_xtp_rx_abort_one(priv, skb, false, transmitter); j1939_xtp_rx_abort_one(priv, skb, true, transmitter); } static void j1939_xtp_rx_eoma_one(struct j1939_session *session, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); const u8 *dat; int len; if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) return; dat = skb->data; if (skcb->addr.type == J1939_ETP) len = j1939_etp_ctl_to_size(dat); else len = j1939_tp_ctl_to_size(dat); if (session->total_message_size != len) { netdev_warn_once(session->priv->ndev, "%s: 0x%p: Incorrect size. Expected: %i; got: %i.\n", __func__, session, session->total_message_size, len); } netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); session->pkt.tx_acked = session->pkt.total; j1939_session_timers_cancel(session); /* transmitted without problems */ j1939_session_completed(session); } static void j1939_xtp_rx_eoma(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_session *session; session = j1939_session_get_by_addr(priv, &skcb->addr, true, transmitter); if (!session) return; j1939_xtp_rx_eoma_one(session, skb); j1939_session_put(session); } static void j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb) { enum j1939_xtp_abort err = J1939_XTP_ABORT_FAULT; unsigned int pkt; const u8 *dat; dat = skb->data; if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) return; netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); if (session->last_cmd == dat[0]) { err = J1939_XTP_ABORT_DUP_SEQ; goto out_session_cancel; } if (session->skcb.addr.type == J1939_ETP) pkt = j1939_etp_ctl_to_packet(dat); else pkt = dat[2]; if (!pkt) goto out_session_cancel; else if (dat[1] > session->pkt.block /* 0xff for etp */) goto out_session_cancel; /* set packet counters only when not CTS(0) */ session->pkt.tx_acked = pkt - 1; j1939_session_skb_drop_old(session); session->pkt.last = session->pkt.tx_acked + dat[1]; if (session->pkt.last > session->pkt.total) /* safety measure */ session->pkt.last = session->pkt.total; /* TODO: do not set tx here, do it in txtimer */ session->pkt.tx = session->pkt.tx_acked; session->last_cmd = dat[0]; if (dat[1]) { j1939_tp_set_rxtimeout(session, 1250); if (session->transmission) { if (session->pkt.tx_acked) j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_SCHED); j1939_session_txtimer_cancel(session); j1939_tp_schedule_txtimer(session, 0); } } else { /* CTS(0) */ j1939_tp_set_rxtimeout(session, 550); } return; out_session_cancel: j1939_session_timers_cancel(session); j1939_session_cancel(session, err); } static void j1939_xtp_rx_cts(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_session *session; session = j1939_session_get_by_addr(priv, &skcb->addr, true, transmitter); if (!session) return; j1939_xtp_rx_cts_one(session, skb); j1939_session_put(session); } static struct j1939_session *j1939_session_new(struct j1939_priv *priv, struct sk_buff *skb, size_t size) { struct j1939_session *session; struct j1939_sk_buff_cb *skcb; session = kzalloc(sizeof(*session), gfp_any()); if (!session) return NULL; INIT_LIST_HEAD(&session->active_session_list_entry); INIT_LIST_HEAD(&session->sk_session_queue_entry); kref_init(&session->kref); j1939_priv_get(priv); session->priv = priv; session->total_message_size = size; session->state = J1939_SESSION_NEW; skb_queue_head_init(&session->skb_queue); skb_queue_tail(&session->skb_queue, skb_get(skb)); skcb = j1939_skb_to_cb(skb); memcpy(&session->skcb, skcb, sizeof(session->skcb)); hrtimer_init(&session->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); session->txtimer.function = j1939_tp_txtimer; hrtimer_init(&session->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); session->rxtimer.function = j1939_tp_rxtimer; netdev_dbg(priv->ndev, "%s: 0x%p: sa: %02x, da: %02x\n", __func__, session, skcb->addr.sa, skcb->addr.da); return session; } static struct j1939_session *j1939_session_fresh_new(struct j1939_priv *priv, int size, const struct j1939_sk_buff_cb *rel_skcb) { struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; struct j1939_session *session; skb = alloc_skb(size + sizeof(struct can_skb_priv), GFP_ATOMIC); if (unlikely(!skb)) return NULL; skb->dev = priv->ndev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = priv->ndev->ifindex; can_skb_prv(skb)->skbcnt = 0; skcb = j1939_skb_to_cb(skb); memcpy(skcb, rel_skcb, sizeof(*skcb)); session = j1939_session_new(priv, skb, size); if (!session) { kfree_skb(skb); return NULL; } /* alloc data area */ skb_put(skb, size); /* skb is recounted in j1939_session_new() */ return session; } int j1939_session_activate(struct j1939_session *session) { struct j1939_priv *priv = session->priv; struct j1939_session *active = NULL; int ret = 0; j1939_session_list_lock(priv); if (session->skcb.addr.type != J1939_SIMPLE) active = j1939_session_get_by_addr_locked(priv, &priv->active_session_list, &session->skcb.addr, false, session->transmission); if (active) { j1939_session_put(active); ret = -EAGAIN; } else { WARN_ON_ONCE(session->state != J1939_SESSION_NEW); list_add_tail(&session->active_session_list_entry, &priv->active_session_list); j1939_session_get(session); session->state = J1939_SESSION_ACTIVE; netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); } j1939_session_list_unlock(priv); return ret; } static struct j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, struct sk_buff *skb) { enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT; struct j1939_sk_buff_cb skcb = *j1939_skb_to_cb(skb); struct j1939_session *session; const u8 *dat; int len, ret; pgn_t pgn; netdev_dbg(priv->ndev, "%s\n", __func__); dat = skb->data; pgn = j1939_xtp_ctl_to_pgn(dat); skcb.addr.pgn = pgn; if (!j1939_sk_recv_match(priv, &skcb)) return NULL; if (skcb.addr.type == J1939_ETP) { len = j1939_etp_ctl_to_size(dat); if (len > J1939_MAX_ETP_PACKET_SIZE) abort = J1939_XTP_ABORT_FAULT; else if (len > priv->tp_max_packet_size) abort = J1939_XTP_ABORT_RESOURCE; else if (len <= J1939_MAX_TP_PACKET_SIZE) abort = J1939_XTP_ABORT_FAULT; } else { len = j1939_tp_ctl_to_size(dat); if (len > J1939_MAX_TP_PACKET_SIZE) abort = J1939_XTP_ABORT_FAULT; else if (len > priv->tp_max_packet_size) abort = J1939_XTP_ABORT_RESOURCE; else if (len < J1939_MIN_TP_PACKET_SIZE) abort = J1939_XTP_ABORT_FAULT; } if (abort != J1939_XTP_NO_ABORT) { j1939_xtp_tx_abort(priv, &skcb, true, abort, pgn); return NULL; } session = j1939_session_fresh_new(priv, len, &skcb); if (!session) { j1939_xtp_tx_abort(priv, &skcb, true, J1939_XTP_ABORT_RESOURCE, pgn); return NULL; } /* initialize the control buffer: plain copy */ session->pkt.total = (len + 6) / 7; session->pkt.block = 0xff; if (skcb.addr.type != J1939_ETP) { if (dat[3] != session->pkt.total) netdev_alert(priv->ndev, "%s: 0x%p: strange total, %u != %u\n", __func__, session, session->pkt.total, dat[3]); session->pkt.total = dat[3]; session->pkt.block = min(dat[3], dat[4]); } session->pkt.rx = 0; session->pkt.tx = 0; session->tskey = priv->rx_tskey++; j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_RTS); ret = j1939_session_activate(session); if (ret) { /* Entering this scope indicates an issue with the J1939 bus. * Possible scenarios include: * - A time lapse occurred, and a new session was initiated * due to another packet being sent correctly. This could * have been caused by too long interrupt, debugger, or being * out-scheduled by another task. * - The bus is receiving numerous erroneous packets, either * from a malfunctioning device or during a test scenario. */ netdev_alert(priv->ndev, "%s: 0x%p: concurrent session with same addr (%02x %02x) is already active.\n", __func__, session, skcb.addr.sa, skcb.addr.da); j1939_session_put(session); return NULL; } return session; } static int j1939_xtp_rx_rts_session_active(struct j1939_session *session, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_priv *priv = session->priv; if (!session->transmission) { if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) return -EBUSY; /* RTS on active session */ j1939_session_timers_cancel(session); j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); } if (session->last_cmd != 0) { /* we received a second rts on the same connection */ netdev_alert(priv->ndev, "%s: 0x%p: connection exists (%02x %02x). last cmd: %x\n", __func__, session, skcb->addr.sa, skcb->addr.da, session->last_cmd); j1939_session_timers_cancel(session); j1939_session_cancel(session, J1939_XTP_ABORT_BUSY); if (session->transmission) j1939_session_deactivate_activate_next(session); return -EBUSY; } if (session->skcb.addr.sa != skcb->addr.sa || session->skcb.addr.da != skcb->addr.da) netdev_warn(priv->ndev, "%s: 0x%p: session->skcb.addr.sa=0x%02x skcb->addr.sa=0x%02x session->skcb.addr.da=0x%02x skcb->addr.da=0x%02x\n", __func__, session, session->skcb.addr.sa, skcb->addr.sa, session->skcb.addr.da, skcb->addr.da); /* make sure 'sa' & 'da' are correct ! * They may be 'not filled in yet' for sending * skb's, since they did not pass the Address Claim ever. */ session->skcb.addr.sa = skcb->addr.sa; session->skcb.addr.da = skcb->addr.da; netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); return 0; } static void j1939_xtp_rx_rts(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_session *session; u8 cmd = skb->data[0]; session = j1939_session_get_by_addr(priv, &skcb->addr, false, transmitter); if (!session) { if (transmitter) { /* If we're the transmitter and this function is called, * we received our own RTS. A session has already been * created. * * For some reasons however it might have been destroyed * already. So don't create a new one here (using * "j1939_xtp_rx_rts_session_new()") as this will be a * receiver session. * * The reasons the session is already destroyed might * be: * - user space closed socket was and the session was * aborted * - session was aborted due to external abort message */ return; } session = j1939_xtp_rx_rts_session_new(priv, skb); if (!session) { if (cmd == J1939_TP_CMD_BAM && j1939_sk_recv_match(priv, skcb)) netdev_info(priv->ndev, "%s: failed to create TP BAM session\n", __func__); return; } } else { if (j1939_xtp_rx_rts_session_active(session, skb)) { j1939_session_put(session); return; } } session->last_cmd = cmd; if (cmd == J1939_TP_CMD_BAM) { if (!session->transmission) j1939_tp_set_rxtimeout(session, 750); } else { if (!session->transmission) { j1939_session_txtimer_cancel(session); j1939_tp_schedule_txtimer(session, 0); } j1939_tp_set_rxtimeout(session, 1250); } j1939_session_put(session); } static void j1939_xtp_rx_dpo_one(struct j1939_session *session, struct sk_buff *skb) { const u8 *dat = skb->data; if (j1939_xtp_rx_cmd_bad_pgn(session, skb)) return; netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); /* transmitted without problems */ session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data); session->last_cmd = dat[0]; j1939_tp_set_rxtimeout(session, 750); if (!session->transmission) j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_DPO); } static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_session *session; session = j1939_session_get_by_addr(priv, &skcb->addr, false, transmitter); if (!session) { netdev_info(priv->ndev, "%s: no connection found\n", __func__); return; } j1939_xtp_rx_dpo_one(session, skb); j1939_session_put(session); } static void j1939_xtp_rx_dat_one(struct j1939_session *session, struct sk_buff *skb) { enum j1939_xtp_abort abort = J1939_XTP_ABORT_FAULT; struct j1939_priv *priv = session->priv; struct j1939_sk_buff_cb *skcb, *se_skcb; struct sk_buff *se_skb = NULL; const u8 *dat; u8 *tpdat; int offset; int nbytes; bool final = false; bool remain = false; bool do_cts_eoma = false; int packet; skcb = j1939_skb_to_cb(skb); dat = skb->data; if (skb->len != 8) { /* makes no sense */ abort = J1939_XTP_ABORT_UNEXPECTED_DATA; goto out_session_cancel; } switch (session->last_cmd) { case 0xff: break; case J1939_ETP_CMD_DPO: if (skcb->addr.type == J1939_ETP) break; fallthrough; case J1939_TP_CMD_BAM: fallthrough; case J1939_TP_CMD_CTS: if (skcb->addr.type != J1939_ETP) break; fallthrough; default: netdev_info(priv->ndev, "%s: 0x%p: last %02x\n", __func__, session, session->last_cmd); goto out_session_cancel; } packet = (dat[0] - 1 + session->pkt.dpo); if (packet > session->pkt.total || (session->pkt.rx + 1) > session->pkt.total) { netdev_info(priv->ndev, "%s: 0x%p: should have been completed\n", __func__, session); goto out_session_cancel; } se_skb = j1939_session_skb_get_by_offset(session, packet * 7); if (!se_skb) { netdev_warn(priv->ndev, "%s: 0x%p: no skb found\n", __func__, session); goto out_session_cancel; } se_skcb = j1939_skb_to_cb(se_skb); offset = packet * 7 - se_skcb->offset; nbytes = se_skb->len - offset; if (nbytes > 7) nbytes = 7; if (nbytes <= 0 || (nbytes + 1) > skb->len) { netdev_info(priv->ndev, "%s: 0x%p: nbytes %i, len %i\n", __func__, session, nbytes, skb->len); goto out_session_cancel; } tpdat = se_skb->data; if (!session->transmission) { memcpy(&tpdat[offset], &dat[1], nbytes); } else { int err; err = memcmp(&tpdat[offset], &dat[1], nbytes); if (err) netdev_err_once(priv->ndev, "%s: 0x%p: Data of RX-looped back packet (%*ph) doesn't match TX data (%*ph)!\n", __func__, session, nbytes, &dat[1], nbytes, &tpdat[offset]); } if (packet == session->pkt.rx) session->pkt.rx++; if (se_skcb->addr.type != J1939_ETP && j1939_cb_is_broadcast(&session->skcb)) { if (session->pkt.rx >= session->pkt.total) final = true; else remain = true; } else { /* never final, an EOMA must follow */ if (session->pkt.rx >= session->pkt.last) do_cts_eoma = true; } if (final) { j1939_session_timers_cancel(session); j1939_session_completed(session); } else if (remain) { if (!session->transmission) j1939_tp_set_rxtimeout(session, 750); } else if (do_cts_eoma) { j1939_tp_set_rxtimeout(session, 1250); if (!session->transmission) j1939_tp_schedule_txtimer(session, 0); } else { j1939_tp_set_rxtimeout(session, 750); } session->last_cmd = 0xff; consume_skb(se_skb); j1939_session_put(session); return; out_session_cancel: kfree_skb(se_skb); j1939_session_timers_cancel(session); j1939_session_cancel(session, abort); j1939_session_put(session); } static void j1939_xtp_rx_dat(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb; struct j1939_session *session; skcb = j1939_skb_to_cb(skb); if (j1939_tp_im_transmitter(skcb)) { session = j1939_session_get_by_addr(priv, &skcb->addr, false, true); if (!session) netdev_info(priv->ndev, "%s: no tx connection found\n", __func__); else j1939_xtp_rx_dat_one(session, skb); } if (j1939_tp_im_receiver(skcb)) { session = j1939_session_get_by_addr(priv, &skcb->addr, false, false); if (!session) netdev_info(priv->ndev, "%s: no rx connection found\n", __func__); else j1939_xtp_rx_dat_one(session, skb); } if (j1939_cb_is_broadcast(skcb)) { session = j1939_session_get_by_addr(priv, &skcb->addr, false, false); if (session) j1939_xtp_rx_dat_one(session, skb); } } /* j1939 main intf */ struct j1939_session *j1939_tp_send(struct j1939_priv *priv, struct sk_buff *skb, size_t size) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); struct j1939_session *session; int ret; if (skcb->addr.pgn == J1939_TP_PGN_DAT || skcb->addr.pgn == J1939_TP_PGN_CTL || skcb->addr.pgn == J1939_ETP_PGN_DAT || skcb->addr.pgn == J1939_ETP_PGN_CTL) /* avoid conflict */ return ERR_PTR(-EDOM); if (size > priv->tp_max_packet_size) return ERR_PTR(-EMSGSIZE); if (size <= 8) skcb->addr.type = J1939_SIMPLE; else if (size > J1939_MAX_TP_PACKET_SIZE) skcb->addr.type = J1939_ETP; else skcb->addr.type = J1939_TP; if (skcb->addr.type == J1939_ETP && j1939_cb_is_broadcast(skcb)) return ERR_PTR(-EDESTADDRREQ); /* fill in addresses from names */ ret = j1939_ac_fixup(priv, skb); if (unlikely(ret)) return ERR_PTR(ret); /* fix DST flags, it may be used there soon */ if (j1939_address_is_unicast(skcb->addr.da) && priv->ents[skcb->addr.da].nusers) skcb->flags |= J1939_ECU_LOCAL_DST; /* src is always local, I'm sending ... */ skcb->flags |= J1939_ECU_LOCAL_SRC; /* prepare new session */ session = j1939_session_new(priv, skb, size); if (!session) return ERR_PTR(-ENOMEM); /* skb is recounted in j1939_session_new() */ sock_hold(skb->sk); session->sk = skb->sk; session->transmission = true; session->pkt.total = (size + 6) / 7; session->pkt.block = skcb->addr.type == J1939_ETP ? 255 : min(j1939_tp_block ?: 255, session->pkt.total); if (j1939_cb_is_broadcast(&session->skcb)) /* set the end-packet for broadcast */ session->pkt.last = session->pkt.total; skcb->tskey = atomic_inc_return(&session->sk->sk_tskey) - 1; session->tskey = skcb->tskey; return session; } static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); int extd = J1939_TP; u8 cmd = skb->data[0]; switch (cmd) { case J1939_ETP_CMD_RTS: extd = J1939_ETP; fallthrough; case J1939_TP_CMD_BAM: if (cmd == J1939_TP_CMD_BAM && !j1939_cb_is_broadcast(skcb)) { netdev_err_once(priv->ndev, "%s: BAM to unicast (%02x), ignoring!\n", __func__, skcb->addr.sa); return; } fallthrough; case J1939_TP_CMD_RTS: if (skcb->addr.type != extd) return; if (cmd == J1939_TP_CMD_RTS && j1939_cb_is_broadcast(skcb)) { netdev_alert(priv->ndev, "%s: rts without destination (%02x)\n", __func__, skcb->addr.sa); return; } if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_rts(priv, skb, true); if (j1939_tp_im_receiver(skcb) || j1939_cb_is_broadcast(skcb)) j1939_xtp_rx_rts(priv, skb, false); break; case J1939_ETP_CMD_CTS: extd = J1939_ETP; fallthrough; case J1939_TP_CMD_CTS: if (skcb->addr.type != extd) return; if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_cts(priv, skb, false); if (j1939_tp_im_receiver(skcb)) j1939_xtp_rx_cts(priv, skb, true); break; case J1939_ETP_CMD_DPO: if (skcb->addr.type != J1939_ETP) return; if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_dpo(priv, skb, true); if (j1939_tp_im_receiver(skcb)) j1939_xtp_rx_dpo(priv, skb, false); break; case J1939_ETP_CMD_EOMA: extd = J1939_ETP; fallthrough; case J1939_TP_CMD_EOMA: if (skcb->addr.type != extd) return; if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_eoma(priv, skb, false); if (j1939_tp_im_receiver(skcb)) j1939_xtp_rx_eoma(priv, skb, true); break; case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */ if (j1939_cb_is_broadcast(skcb)) { netdev_err_once(priv->ndev, "%s: abort to broadcast (%02x), ignoring!\n", __func__, skcb->addr.sa); return; } if (j1939_tp_im_transmitter(skcb)) j1939_xtp_rx_abort(priv, skb, true); if (j1939_tp_im_receiver(skcb)) j1939_xtp_rx_abort(priv, skb, false); break; default: return; } } int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb); if (!j1939_tp_im_involved_anydir(skcb) && !j1939_cb_is_broadcast(skcb)) return 0; switch (skcb->addr.pgn) { case J1939_ETP_PGN_DAT: skcb->addr.type = J1939_ETP; fallthrough; case J1939_TP_PGN_DAT: j1939_xtp_rx_dat(priv, skb); break; case J1939_ETP_PGN_CTL: skcb->addr.type = J1939_ETP; fallthrough; case J1939_TP_PGN_CTL: if (skb->len < 8) return 0; /* Don't care. Nothing to extract here */ j1939_tp_cmd_recv(priv, skb); break; default: return 0; /* no problem */ } return 1; /* "I processed the message" */ } void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_session *session; if (!skb->sk) return; if (skb->sk->sk_family != AF_CAN || skb->sk->sk_protocol != CAN_J1939) return; j1939_session_list_lock(priv); session = j1939_session_get_simple(priv, skb); j1939_session_list_unlock(priv); if (!session) { netdev_warn(priv->ndev, "%s: Received already invalidated message\n", __func__); return; } j1939_session_timers_cancel(session); j1939_session_deactivate(session); j1939_session_put(session); } int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk) { struct j1939_session *session, *saved; netdev_dbg(priv->ndev, "%s, sk: %p\n", __func__, sk); j1939_session_list_lock(priv); list_for_each_entry_safe(session, saved, &priv->active_session_list, active_session_list_entry) { if (!sk || sk == session->sk) { if (hrtimer_try_to_cancel(&session->txtimer) == 1) j1939_session_put(session); if (hrtimer_try_to_cancel(&session->rxtimer) == 1) j1939_session_put(session); session->err = ESHUTDOWN; j1939_session_deactivate_locked(session); } } j1939_session_list_unlock(priv); return NOTIFY_DONE; } void j1939_tp_init(struct j1939_priv *priv) { spin_lock_init(&priv->active_session_list_lock); INIT_LIST_HEAD(&priv->active_session_list); priv->tp_max_packet_size = J1939_MAX_ETP_PACKET_SIZE; }
9 8 3 8 3 3 3 3 3 21 21 412 412 412 412 3 3 2 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1995 Linus Torvalds * * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 * * X86-64 port * Andi Kleen. * * CPU hotplug support - ashok.raj@intel.com */ /* * This file handles the architecture-dependent parts of process handling.. */ #include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/elfcore.h> #include <linux/smp.h> #include <linux/slab.h> #include <linux/user.h> #include <linux/interrupt.h> #include <linux/delay.h> #include <linux/export.h> #include <linux/ptrace.h> #include <linux/notifier.h> #include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/prctl.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/ftrace.h> #include <linux/syscalls.h> #include <linux/iommu.h> #include <asm/processor.h> #include <asm/pkru.h> #include <asm/fpu/sched.h> #include <asm/mmu_context.h> #include <asm/prctl.h> #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> #include <asm/resctrl.h> #include <asm/unistd.h> #include <asm/fsgsbase.h> #include <asm/fred.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> #endif #include "process.h" /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, const char *log_lvl) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, es; show_iret_regs(regs, log_lvl); if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else pr_cont("\n"); printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n", log_lvl, regs->ax, regs->bx, regs->cx); printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n", log_lvl, regs->dx, regs->si, regs->di); printk("%sRBP: %016lx R08: %016lx R09: %016lx\n", log_lvl, regs->bp, regs->r8, regs->r9); printk("%sR10: %016lx R11: %016lx R12: %016lx\n", log_lvl, regs->r10, regs->r11, regs->r12); printk("%sR13: %016lx R14: %016lx R15: %016lx\n", log_lvl, regs->r13, regs->r14, regs->r15); if (mode == SHOW_REGS_SHORT) return; if (mode == SHOW_REGS_USER) { rdmsrl(MSR_FS_BASE, fs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); printk("%sFS: %016lx GS: %016lx\n", log_lvl, fs, shadowgs); return; } asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); rdmsrl(MSR_FS_BASE, fs); rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); cr0 = read_cr0(); cr2 = read_cr2(); cr3 = __read_cr3(); cr4 = __read_cr4(); printk("%sFS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", log_lvl, fs, fsindex, gs, gsindex, shadowgs); printk("%sCS: %04x DS: %04x ES: %04x CR0: %016lx\n", log_lvl, regs->cs, ds, es, cr0); printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n", log_lvl, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); /* Only print out debug registers if they are in their non-default state. */ if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && (d6 == DR6_RESERVED) && (d7 == 0x400))) { printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n", log_lvl, d0, d1, d2); printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n", log_lvl, d3, d6, d7); } if (cr4 & X86_CR4_PKE) printk("%sPKRU: %08x\n", log_lvl, read_pkru()); } void release_thread(struct task_struct *dead_task) { WARN_ON(dead_task->mm); } enum which_selector { FS, GS }; /* * Out of line to be protected from kprobes and tracing. If this would be * traced or probed than any access to a per CPU variable happens with * the wrong GS. * * It is not used on Xen paravirt. When paravirt support is needed, it * needs to be renamed with native_ prefix. */ static noinstr unsigned long __rdgsbase_inactive(void) { unsigned long gsbase; lockdep_assert_irqs_disabled(); /* * SWAPGS is no longer needed thus NOT allowed with FRED because * FRED transitions ensure that an operating system can _always_ * operate with its own GS base address: * - For events that occur in ring 3, FRED event delivery swaps * the GS base address with the IA32_KERNEL_GS_BASE MSR. * - ERETU (the FRED transition that returns to ring 3) also swaps * the GS base address with the IA32_KERNEL_GS_BASE MSR. * * And the operating system can still setup the GS segment for a * user thread without the need of loading a user thread GS with: * - Using LKGS, available with FRED, to modify other attributes * of the GS segment without compromising its ability always to * operate with its own GS base address. * - Accessing the GS segment base address for a user thread as * before using RDMSR or WRMSR on the IA32_KERNEL_GS_BASE MSR. * * Note, LKGS loads the GS base address into the IA32_KERNEL_GS_BASE * MSR instead of the GS segment’s descriptor cache. As such, the * operating system never changes its runtime GS base address. */ if (!cpu_feature_enabled(X86_FEATURE_FRED) && !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); gsbase = rdgsbase(); native_swapgs(); } else { instrumentation_begin(); rdmsrl(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } return gsbase; } /* * Out of line to be protected from kprobes and tracing. If this would be * traced or probed than any access to a per CPU variable happens with * the wrong GS. * * It is not used on Xen paravirt. When paravirt support is needed, it * needs to be renamed with native_ prefix. */ static noinstr void __wrgsbase_inactive(unsigned long gsbase) { lockdep_assert_irqs_disabled(); if (!cpu_feature_enabled(X86_FEATURE_FRED) && !cpu_feature_enabled(X86_FEATURE_XENPV)) { native_swapgs(); wrgsbase(gsbase); native_swapgs(); } else { instrumentation_begin(); wrmsrl(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } } /* * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are * not available. The goal is to be reasonably fast on non-FSGSBASE systems. * It's forcibly inlined because it'll generate better code and this function * is hot. */ static __always_inline void save_base_legacy(struct task_struct *prev_p, unsigned short selector, enum which_selector which) { if (likely(selector == 0)) { /* * On Intel (without X86_BUG_NULL_SEG), the segment base could * be the pre-existing saved base or it could be zero. On AMD * (with X86_BUG_NULL_SEG), the segment base could be almost * anything. * * This branch is very hot (it's hit twice on almost every * context switch between 64-bit programs), and avoiding * the RDMSR helps a lot, so we just assume that whatever * value is already saved is correct. This matches historical * Linux behavior, so it won't break existing applications. * * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we * report that the base is zero, it needs to actually be zero: * see the corresponding logic in load_seg_legacy. */ } else { /* * If the selector is 1, 2, or 3, then the base is zero on * !X86_BUG_NULL_SEG CPUs and could be anything on * X86_BUG_NULL_SEG CPUs. In the latter case, Linux * has never attempted to preserve the base across context * switches. * * If selector > 3, then it refers to a real segment, and * saving the base isn't necessary. */ if (which == FS) prev_p->thread.fsbase = 0; else prev_p->thread.gsbase = 0; } } static __always_inline void save_fsgs(struct task_struct *task) { savesegment(fs, task->thread.fsindex); savesegment(gs, task->thread.gsindex); if (static_cpu_has(X86_FEATURE_FSGSBASE)) { /* * If FSGSBASE is enabled, we can't make any useful guesses * about the base, and user code expects us to save the current * value. Fortunately, reading the base directly is efficient. */ task->thread.fsbase = rdfsbase(); task->thread.gsbase = __rdgsbase_inactive(); } else { save_base_legacy(task, task->thread.fsindex, FS); save_base_legacy(task, task->thread.gsindex, GS); } } /* * While a process is running,current->thread.fsbase and current->thread.gsbase * may not match the corresponding CPU registers (see save_base_legacy()). */ void current_save_fsgs(void) { unsigned long flags; /* Interrupts need to be off for FSGSBASE */ local_irq_save(flags); save_fsgs(current); local_irq_restore(flags); } #if IS_ENABLED(CONFIG_KVM) EXPORT_SYMBOL_GPL(current_save_fsgs); #endif static __always_inline void loadseg(enum which_selector which, unsigned short sel) { if (which == FS) loadsegment(fs, sel); else load_gs_index(sel); } static __always_inline void load_seg_legacy(unsigned short prev_index, unsigned long prev_base, unsigned short next_index, unsigned long next_base, enum which_selector which) { if (likely(next_index <= 3)) { /* * The next task is using 64-bit TLS, is not using this * segment at all, or is having fun with arcane CPU features. */ if (next_base == 0) { /* * Nasty case: on AMD CPUs, we need to forcibly zero * the base. */ if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { loadseg(which, __USER_DS); loadseg(which, next_index); } else { /* * We could try to exhaustively detect cases * under which we can skip the segment load, * but there's really only one case that matters * for performance: if both the previous and * next states are fully zeroed, we can skip * the load. * * (This assumes that prev_base == 0 has no * false positives. This is the case on * Intel-style CPUs.) */ if (likely(prev_index | next_index | prev_base)) loadseg(which, next_index); } } else { if (prev_index != next_index) loadseg(which, next_index); wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, next_base); } } else { /* * The next task is using a real segment. Loading the selector * is sufficient. */ loadseg(which, next_index); } } /* * Store prev's PKRU value and load next's PKRU value if they differ. PKRU * is not XSTATE managed on context switch because that would require a * lookup in the task's FPU xsave buffer and require to keep that updated * in various places. */ static __always_inline void x86_pkru_load(struct thread_struct *prev, struct thread_struct *next) { if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return; /* Stash the prev task's value: */ prev->pkru = rdpkru(); /* * PKRU writes are slightly expensive. Avoid them when not * strictly necessary: */ if (prev->pkru != next->pkru) wrpkru(next->pkru); } static __always_inline void x86_fsgsbase_load(struct thread_struct *prev, struct thread_struct *next) { if (static_cpu_has(X86_FEATURE_FSGSBASE)) { /* Update the FS and GS selectors if they could have changed. */ if (unlikely(prev->fsindex || next->fsindex)) loadseg(FS, next->fsindex); if (unlikely(prev->gsindex || next->gsindex)) loadseg(GS, next->gsindex); /* Update the bases. */ wrfsbase(next->fsbase); __wrgsbase_inactive(next->gsbase); } else { load_seg_legacy(prev->fsindex, prev->fsbase, next->fsindex, next->fsbase, FS); load_seg_legacy(prev->gsindex, prev->gsbase, next->gsindex, next->gsbase, GS); } } unsigned long x86_fsgsbase_read_task(struct task_struct *task, unsigned short selector) { unsigned short idx = selector >> 3; unsigned long base; if (likely((selector & SEGMENT_TI_MASK) == 0)) { if (unlikely(idx >= GDT_ENTRIES)) return 0; /* * There are no user segments in the GDT with nonzero bases * other than the TLS segments. */ if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return 0; idx -= GDT_ENTRY_TLS_MIN; base = get_desc_base(&task->thread.tls_array[idx]); } else { #ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* * If performance here mattered, we could protect the LDT * with RCU. This is a slow path, though, so we can just * take the mutex. */ mutex_lock(&task->mm->context.lock); ldt = task->mm->context.ldt; if (unlikely(!ldt || idx >= ldt->nr_entries)) base = 0; else base = get_desc_base(ldt->entries + idx); mutex_unlock(&task->mm->context.lock); #else base = 0; #endif } return base; } unsigned long x86_gsbase_read_cpu_inactive(void) { unsigned long gsbase; if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); gsbase = __rdgsbase_inactive(); local_irq_restore(flags); } else { rdmsrl(MSR_KERNEL_GS_BASE, gsbase); } return gsbase; } void x86_gsbase_write_cpu_inactive(unsigned long gsbase) { if (boot_cpu_has(X86_FEATURE_FSGSBASE)) { unsigned long flags; local_irq_save(flags); __wrgsbase_inactive(gsbase); local_irq_restore(flags); } else { wrmsrl(MSR_KERNEL_GS_BASE, gsbase); } } unsigned long x86_fsbase_read_task(struct task_struct *task) { unsigned long fsbase; if (task == current) fsbase = x86_fsbase_read_cpu(); else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.fsindex == 0)) fsbase = task->thread.fsbase; else fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex); return fsbase; } unsigned long x86_gsbase_read_task(struct task_struct *task) { unsigned long gsbase; if (task == current) gsbase = x86_gsbase_read_cpu_inactive(); else if (boot_cpu_has(X86_FEATURE_FSGSBASE) || (task->thread.gsindex == 0)) gsbase = task->thread.gsbase; else gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex); return gsbase; } void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase) { WARN_ON_ONCE(task == current); task->thread.fsbase = fsbase; } void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase) { WARN_ON_ONCE(task == current); task->thread.gsbase = gsbase; } static void start_thread_common(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp, u16 _cs, u16 _ss, u16 _ds) { WARN_ON_ONCE(regs != current_pt_regs()); if (static_cpu_has(X86_BUG_NULL_SEG)) { /* Loading zero below won't clear the base. */ loadsegment(fs, __USER_DS); load_gs_index(__USER_DS); } reset_thread_features(); loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); regs->ip = new_ip; regs->sp = new_sp; regs->csx = _cs; regs->ssx = _ss; /* * Allow single-step trap and NMI when starting a new task, thus * once the new task enters user space, single-step trap and NMI * are both enabled immediately. * * Entering a new task is logically speaking a return from a * system call (exec, fork, clone, etc.). As such, if ptrace * enables single stepping a single step exception should be * allowed to trigger immediately upon entering user space. * This is not optional. * * NMI should *never* be disabled in user space. As such, this * is an optional, opportunistic way to catch errors. * * Paranoia: High-order 48 bits above the lowest 16 bit SS are * discarded by the legacy IRET instruction on all Intel, AMD, * and Cyrix/Centaur/VIA CPUs, thus can be set unconditionally, * even when FRED is not enabled. But we choose the safer side * to use these bits only when FRED is enabled. */ if (cpu_feature_enabled(X86_FEATURE_FRED)) { regs->fred_ss.swevent = true; regs->fred_ss.nmi = true; } regs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; } void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { start_thread_common(regs, new_ip, new_sp, __USER_CS, __USER_DS, 0); } EXPORT_SYMBOL_GPL(start_thread); #ifdef CONFIG_COMPAT void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) { start_thread_common(regs, new_ip, new_sp, x32 ? __USER_CS : __USER32_CS, __USER_DS, __USER_DS); } #endif /* * switch_to(x,y) should switch tasks from x to y. * * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced * * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ __no_kmsan_checks __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(pcpu_hot.hardirq_stack_inuse)); if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) switch_fpu_prepare(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * * (e.g. xen_load_tls()) */ save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads * reference the correct GDT entries. */ load_TLS(next, cpu); /* * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before * loading segments that might reference them. */ arch_end_context_switch(next_p); /* Switch DS and ES. * * Reading them only returns the selectors, but writing them (if * nonzero) loads the full descriptor from the GDT or LDT. The * LDT for next is loaded in switch_mm, and the GDT is loaded * above. * * We therefore need to write new values to the segment * registers on every context switch unless both the new and old * values are zero. * * Note that we don't need to do anything for CS and SS, as * those are saved and restored as part of pt_regs. */ savesegment(es, prev->es); if (unlikely(next->es | prev->es)) loadsegment(es, next->es); savesegment(ds, prev->ds); if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); x86_fsgsbase_load(prev, next); x86_pkru_load(prev, next); /* * Switch the PDA and FPU contexts. */ raw_cpu_write(pcpu_hot.current_task, next_p); raw_cpu_write(pcpu_hot.top_of_stack, task_top_of_stack(next_p)); switch_fpu_finish(next_p); /* Reload sp0. */ update_task_stack(next_p); switch_to_extra(prev_p, next_p); if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { /* * AMD CPUs have a misfeature: SYSRET sets the SS selector but * does not update the cached descriptor. As a result, if we * do SYSRET while SS is NULL, we'll end up in user mode with * SS apparently equal to __USER_DS but actually unusable. * * The straightforward workaround would be to fix it up just * before SYSRET, but that would slow down the system call * fast paths. Instead, we ensure that SS is never NULL in * system call context. We do this by replacing NULL SS * selectors at every context switch. SYSCALL sets up a valid * SS, so the only way to get NULL is to re-enter the kernel * from CPL 3 through an interrupt. Since that can't happen * in the same task as a running syscall, we are guaranteed to * context switch between every interrupt vector entry and a * subsequent SYSRET. * * We read SS first because SS reads are much faster than * writes. Out of caution, we force SS to __KERNEL_DS even if * it previously had a different non-NULL value. */ unsigned short ss_sel; savesegment(ss, ss_sel); if (ss_sel != __KERNEL_DS) loadsegment(ss, __KERNEL_DS); } /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(next_p); return prev_p; } void set_personality_64bit(void) { /* inherit personality from parent */ /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_ADDR32); /* Pretend that this comes from a 64bit execve */ task_pt_regs(current)->orig_ax = __NR_execve; current_thread_info()->status &= ~TS_COMPAT; if (current->mm) __set_bit(MM_CONTEXT_HAS_VSYSCALL, &current->mm->context.flags); /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that 32bit children are affected again. */ current->personality &= ~READ_IMPLIES_EXEC; } static void __set_personality_x32(void) { #ifdef CONFIG_X86_X32_ABI if (current->mm) current->mm->context.flags = 0; current->personality &= ~READ_IMPLIES_EXEC; /* * in_32bit_syscall() uses the presence of the x32 syscall bit * flag to determine compat status. The x86 mmap() code relies on * the syscall bitness so set x32 syscall bit right here to make * in_32bit_syscall() work during exec(). * * Pretend to come from a x32 execve. */ task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; current_thread_info()->status &= ~TS_COMPAT; #endif } static void __set_personality_ia32(void) { #ifdef CONFIG_IA32_EMULATION if (current->mm) { /* * uprobes applied to this MM need to know this and * cannot use user_64bit_mode() at that time. */ __set_bit(MM_CONTEXT_UPROBE_IA32, &current->mm->context.flags); } current->personality |= force_personality32; /* Prepare the first "return" to user space */ task_pt_regs(current)->orig_ax = __NR_ia32_execve; current_thread_info()->status |= TS_COMPAT; #endif } void set_personality_ia32(bool x32) { /* Make sure to be in 32bit mode */ set_thread_flag(TIF_ADDR32); if (x32) __set_personality_x32(); else __set_personality_ia32(); } EXPORT_SYMBOL_GPL(set_personality_ia32); #ifdef CONFIG_CHECKPOINT_RESTORE static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr) { int ret; ret = map_vdso_once(image, addr); if (ret) return ret; return (long)image->size; } #endif #ifdef CONFIG_ADDRESS_MASKING #define LAM_U57_BITS 6 static void enable_lam_func(void *__mm) { struct mm_struct *mm = __mm; unsigned long lam; if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) { lam = mm_lam_cr3_mask(mm); write_cr3(__read_cr3() | lam); cpu_tlbstate_update_lam(lam, mm_untag_mask(mm)); } } static void mm_enable_lam(struct mm_struct *mm) { mm->context.lam_cr3_mask = X86_CR3_LAM_U57; mm->context.untag_mask = ~GENMASK(62, 57); /* * Even though the process must still be single-threaded at this * point, kernel threads may be using the mm. IPI those kernel * threads if they exist. */ on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true); set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags); } static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits) { if (!cpu_feature_enabled(X86_FEATURE_LAM)) return -ENODEV; /* PTRACE_ARCH_PRCTL */ if (current->mm != mm) return -EINVAL; if (mm_valid_pasid(mm) && !test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags)) return -EINVAL; if (mmap_write_lock_killable(mm)) return -EINTR; /* * MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from * being enabled unless the process is single threaded: */ if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) { mmap_write_unlock(mm); return -EBUSY; } if (!nr_bits || nr_bits > LAM_U57_BITS) { mmap_write_unlock(mm); return -EINVAL; } mm_enable_lam(mm); mmap_write_unlock(mm); return 0; } #endif long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2) { int ret = 0; switch (option) { case ARCH_SET_GS: { if (unlikely(arg2 >= TASK_SIZE_MAX)) return -EPERM; preempt_disable(); /* * ARCH_SET_GS has always overwritten the index * and the base. Zero is the most sensible value * to put in the index, and is the only value that * makes any sense if FSGSBASE is unavailable. */ if (task == current) { loadseg(GS, 0); x86_gsbase_write_cpu_inactive(arg2); /* * On non-FSGSBASE systems, save_base_legacy() expects * that we also fill in thread.gsbase. */ task->thread.gsbase = arg2; } else { task->thread.gsindex = 0; x86_gsbase_write_task(task, arg2); } preempt_enable(); break; } case ARCH_SET_FS: { /* * Not strictly needed for %fs, but do it for symmetry * with %gs */ if (unlikely(arg2 >= TASK_SIZE_MAX)) return -EPERM; preempt_disable(); /* * Set the selector to 0 for the same reason * as %gs above. */ if (task == current) { loadseg(FS, 0); x86_fsbase_write_cpu(arg2); /* * On non-FSGSBASE systems, save_base_legacy() expects * that we also fill in thread.fsbase. */ task->thread.fsbase = arg2; } else { task->thread.fsindex = 0; x86_fsbase_write_task(task, arg2); } preempt_enable(); break; } case ARCH_GET_FS: { unsigned long base = x86_fsbase_read_task(task); ret = put_user(base, (unsigned long __user *)arg2); break; } case ARCH_GET_GS: { unsigned long base = x86_gsbase_read_task(task); ret = put_user(base, (unsigned long __user *)arg2); break; } #ifdef CONFIG_CHECKPOINT_RESTORE # ifdef CONFIG_X86_X32_ABI case ARCH_MAP_VDSO_X32: return prctl_map_vdso(&vdso_image_x32, arg2); # endif # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case ARCH_MAP_VDSO_32: return prctl_map_vdso(&vdso_image_32, arg2); # endif case ARCH_MAP_VDSO_64: return prctl_map_vdso(&vdso_image_64, arg2); #endif #ifdef CONFIG_ADDRESS_MASKING case ARCH_GET_UNTAG_MASK: return put_user(task->mm->context.untag_mask, (unsigned long __user *)arg2); case ARCH_ENABLE_TAGGED_ADDR: return prctl_enable_tagged_addr(task->mm, arg2); case ARCH_FORCE_TAGGED_SVA: if (current != task) return -EINVAL; set_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &task->mm->context.flags); return 0; case ARCH_GET_MAX_TAG_BITS: if (!cpu_feature_enabled(X86_FEATURE_LAM)) return put_user(0, (unsigned long __user *)arg2); else return put_user(LAM_U57_BITS, (unsigned long __user *)arg2); #endif case ARCH_SHSTK_ENABLE: case ARCH_SHSTK_DISABLE: case ARCH_SHSTK_LOCK: case ARCH_SHSTK_UNLOCK: case ARCH_SHSTK_STATUS: return shstk_prctl(task, option, arg2); default: ret = -EINVAL; break; } return ret; } SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { long ret; ret = do_arch_prctl_64(current, option, arg2); if (ret == -EINVAL) ret = do_arch_prctl_common(option, arg2); return ret; } #ifdef CONFIG_IA32_EMULATION COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2) { return do_arch_prctl_common(option, arg2); } #endif unsigned long KSTK_ESP(struct task_struct *task) { return task_pt_regs(task)->sp; }
17 17 10 10 6 7 4 3 7 13 4 2 5 1 1 2 1 2 19 6 21 13 13 6 2 1 4 3 10 13 13 1 1 8 4 5 8 2 1 4 2 2 2 2 2 27 27 1 2 24 1 2 20 20 8 22 28 1 27 6 1 2 3 3 2 1 1 8 2 1 6 1 4 6 2 6 5 6 6 6 2 2 2 6 2 1 4 1 3 3 2 1 1 1 4 1 2 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 // SPDX-License-Identifier: GPL-2.0-only /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. */ #include <linux/vmw_vmci_defs.h> #include <linux/vmw_vmci_api.h> #include <linux/highmem.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/pagemap.h> #include <linux/pci.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/wait.h> #include <linux/vmalloc.h> #include <linux/skbuff.h> #include "vmci_handle_array.h" #include "vmci_queue_pair.h" #include "vmci_datagram.h" #include "vmci_resource.h" #include "vmci_context.h" #include "vmci_driver.h" #include "vmci_event.h" #include "vmci_route.h" /* * In the following, we will distinguish between two kinds of VMX processes - * the ones with versions lower than VMCI_VERSION_NOVMVM that use specialized * VMCI page files in the VMX and supporting VM to VM communication and the * newer ones that use the guest memory directly. We will in the following * refer to the older VMX versions as old-style VMX'en, and the newer ones as * new-style VMX'en. * * The state transition datagram is as follows (the VMCIQPB_ prefix has been * removed for readability) - see below for more details on the transtions: * * -------------- NEW ------------- * | | * \_/ \_/ * CREATED_NO_MEM <-----------------> CREATED_MEM * | | | * | o-----------------------o | * | | | * \_/ \_/ \_/ * ATTACHED_NO_MEM <----------------> ATTACHED_MEM * | | | * | o----------------------o | * | | | * \_/ \_/ \_/ * SHUTDOWN_NO_MEM <----------------> SHUTDOWN_MEM * | | * | | * -------------> gone <------------- * * In more detail. When a VMCI queue pair is first created, it will be in the * VMCIQPB_NEW state. It will then move into one of the following states: * * - VMCIQPB_CREATED_NO_MEM: this state indicates that either: * * - the created was performed by a host endpoint, in which case there is * no backing memory yet. * * - the create was initiated by an old-style VMX, that uses * vmci_qp_broker_set_page_store to specify the UVAs of the queue pair at * a later point in time. This state can be distinguished from the one * above by the context ID of the creator. A host side is not allowed to * attach until the page store has been set. * * - VMCIQPB_CREATED_MEM: this state is the result when the queue pair * is created by a VMX using the queue pair device backend that * sets the UVAs of the queue pair immediately and stores the * information for later attachers. At this point, it is ready for * the host side to attach to it. * * Once the queue pair is in one of the created states (with the exception of * the case mentioned for older VMX'en above), it is possible to attach to the * queue pair. Again we have two new states possible: * * - VMCIQPB_ATTACHED_MEM: this state can be reached through the following * paths: * * - from VMCIQPB_CREATED_NO_MEM when a new-style VMX allocates a queue * pair, and attaches to a queue pair previously created by the host side. * * - from VMCIQPB_CREATED_MEM when the host side attaches to a queue pair * already created by a guest. * * - from VMCIQPB_ATTACHED_NO_MEM, when an old-style VMX calls * vmci_qp_broker_set_page_store (see below). * * - VMCIQPB_ATTACHED_NO_MEM: If the queue pair already was in the * VMCIQPB_CREATED_NO_MEM due to a host side create, an old-style VMX will * bring the queue pair into this state. Once vmci_qp_broker_set_page_store * is called to register the user memory, the VMCIQPB_ATTACH_MEM state * will be entered. * * From the attached queue pair, the queue pair can enter the shutdown states * when either side of the queue pair detaches. If the guest side detaches * first, the queue pair will enter the VMCIQPB_SHUTDOWN_NO_MEM state, where * the content of the queue pair will no longer be available. If the host * side detaches first, the queue pair will either enter the * VMCIQPB_SHUTDOWN_MEM, if the guest memory is currently mapped, or * VMCIQPB_SHUTDOWN_NO_MEM, if the guest memory is not mapped * (e.g., the host detaches while a guest is stunned). * * New-style VMX'en will also unmap guest memory, if the guest is * quiesced, e.g., during a snapshot operation. In that case, the guest * memory will no longer be available, and the queue pair will transition from * *_MEM state to a *_NO_MEM state. The VMX may later map the memory once more, * in which case the queue pair will transition from the *_NO_MEM state at that * point back to the *_MEM state. Note that the *_NO_MEM state may have changed, * since the peer may have either attached or detached in the meantime. The * values are laid out such that ++ on a state will move from a *_NO_MEM to a * *_MEM state, and vice versa. */ /* The Kernel specific component of the struct vmci_queue structure. */ struct vmci_queue_kern_if { struct mutex __mutex; /* Protects the queue. */ struct mutex *mutex; /* Shared by producer and consumer queues. */ size_t num_pages; /* Number of pages incl. header. */ bool host; /* Host or guest? */ union { struct { dma_addr_t *pas; void **vas; } g; /* Used by the guest. */ struct { struct page **page; struct page **header_page; } h; /* Used by the host. */ } u; }; /* * This structure is opaque to the clients. */ struct vmci_qp { struct vmci_handle handle; struct vmci_queue *produce_q; struct vmci_queue *consume_q; u64 produce_q_size; u64 consume_q_size; u32 peer; u32 flags; u32 priv_flags; bool guest_endpoint; unsigned int blocked; unsigned int generation; wait_queue_head_t event; }; enum qp_broker_state { VMCIQPB_NEW, VMCIQPB_CREATED_NO_MEM, VMCIQPB_CREATED_MEM, VMCIQPB_ATTACHED_NO_MEM, VMCIQPB_ATTACHED_MEM, VMCIQPB_SHUTDOWN_NO_MEM, VMCIQPB_SHUTDOWN_MEM, VMCIQPB_GONE }; #define QPBROKERSTATE_HAS_MEM(_qpb) (_qpb->state == VMCIQPB_CREATED_MEM || \ _qpb->state == VMCIQPB_ATTACHED_MEM || \ _qpb->state == VMCIQPB_SHUTDOWN_MEM) /* * In the queue pair broker, we always use the guest point of view for * the produce and consume queue values and references, e.g., the * produce queue size stored is the guests produce queue size. The * host endpoint will need to swap these around. The only exception is * the local queue pairs on the host, in which case the host endpoint * that creates the queue pair will have the right orientation, and * the attaching host endpoint will need to swap. */ struct qp_entry { struct list_head list_item; struct vmci_handle handle; u32 peer; u32 flags; u64 produce_size; u64 consume_size; u32 ref_count; }; struct qp_broker_entry { struct vmci_resource resource; struct qp_entry qp; u32 create_id; u32 attach_id; enum qp_broker_state state; bool require_trusted_attach; bool created_by_trusted; bool vmci_page_files; /* Created by VMX using VMCI page files */ struct vmci_queue *produce_q; struct vmci_queue *consume_q; struct vmci_queue_header saved_produce_q; struct vmci_queue_header saved_consume_q; vmci_event_release_cb wakeup_cb; void *client_data; void *local_mem; /* Kernel memory for local queue pair */ }; struct qp_guest_endpoint { struct vmci_resource resource; struct qp_entry qp; u64 num_ppns; void *produce_q; void *consume_q; struct ppn_set ppn_set; }; struct qp_list { struct list_head head; struct mutex mutex; /* Protect queue list. */ }; static struct qp_list qp_broker_list = { .head = LIST_HEAD_INIT(qp_broker_list.head), .mutex = __MUTEX_INITIALIZER(qp_broker_list.mutex), }; static struct qp_list qp_guest_endpoints = { .head = LIST_HEAD_INIT(qp_guest_endpoints.head), .mutex = __MUTEX_INITIALIZER(qp_guest_endpoints.mutex), }; #define INVALID_VMCI_GUEST_MEM_ID 0 #define QPE_NUM_PAGES(_QPE) ((u32) \ (DIV_ROUND_UP(_QPE.produce_size, PAGE_SIZE) + \ DIV_ROUND_UP(_QPE.consume_size, PAGE_SIZE) + 2)) #define QP_SIZES_ARE_VALID(_prod_qsize, _cons_qsize) \ ((_prod_qsize) + (_cons_qsize) >= max(_prod_qsize, _cons_qsize) && \ (_prod_qsize) + (_cons_qsize) <= VMCI_MAX_GUEST_QP_MEMORY) /* * Frees kernel VA space for a given queue and its queue header, and * frees physical data pages. */ static void qp_free_queue(void *q, u64 size) { struct vmci_queue *queue = q; if (queue) { u64 i; /* Given size does not include header, so add in a page here. */ for (i = 0; i < DIV_ROUND_UP(size, PAGE_SIZE) + 1; i++) { dma_free_coherent(&vmci_pdev->dev, PAGE_SIZE, queue->kernel_if->u.g.vas[i], queue->kernel_if->u.g.pas[i]); } vfree(queue); } } /* * Allocates kernel queue pages of specified size with IOMMU mappings, * plus space for the queue structure/kernel interface and the queue * header. */ static void *qp_alloc_queue(u64 size, u32 flags) { u64 i; struct vmci_queue *queue; size_t pas_size; size_t vas_size; size_t queue_size = sizeof(*queue) + sizeof(*queue->kernel_if); u64 num_pages; if (size > SIZE_MAX - PAGE_SIZE) return NULL; num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / (sizeof(*queue->kernel_if->u.g.pas) + sizeof(*queue->kernel_if->u.g.vas))) return NULL; pas_size = num_pages * sizeof(*queue->kernel_if->u.g.pas); vas_size = num_pages * sizeof(*queue->kernel_if->u.g.vas); queue_size += pas_size + vas_size; queue = vmalloc(queue_size); if (!queue) return NULL; queue->q_header = NULL; queue->saved_header = NULL; queue->kernel_if = (struct vmci_queue_kern_if *)(queue + 1); queue->kernel_if->mutex = NULL; queue->kernel_if->num_pages = num_pages; queue->kernel_if->u.g.pas = (dma_addr_t *)(queue->kernel_if + 1); queue->kernel_if->u.g.vas = (void **)((u8 *)queue->kernel_if->u.g.pas + pas_size); queue->kernel_if->host = false; for (i = 0; i < num_pages; i++) { queue->kernel_if->u.g.vas[i] = dma_alloc_coherent(&vmci_pdev->dev, PAGE_SIZE, &queue->kernel_if->u.g.pas[i], GFP_KERNEL); if (!queue->kernel_if->u.g.vas[i]) { /* Size excl. the header. */ qp_free_queue(queue, i * PAGE_SIZE); return NULL; } } /* Queue header is the first page. */ queue->q_header = queue->kernel_if->u.g.vas[0]; return queue; } /* * Copies from a given buffer or iovector to a VMCI Queue. Uses * kmap_local_page() to dynamically map required portions of the queue * by traversing the offset -> page translation structure for the queue. * Assumes that offset + size does not wrap around in the queue. */ static int qp_memcpy_to_queue_iter(struct vmci_queue *queue, u64 queue_offset, struct iov_iter *from, size_t size) { struct vmci_queue_kern_if *kernel_if = queue->kernel_if; size_t bytes_copied = 0; while (bytes_copied < size) { const u64 page_index = (queue_offset + bytes_copied) / PAGE_SIZE; const size_t page_offset = (queue_offset + bytes_copied) & (PAGE_SIZE - 1); void *va; size_t to_copy; if (kernel_if->host) va = kmap_local_page(kernel_if->u.h.page[page_index]); else va = kernel_if->u.g.vas[page_index + 1]; /* Skip header. */ if (size - bytes_copied > PAGE_SIZE - page_offset) /* Enough payload to fill up from this page. */ to_copy = PAGE_SIZE - page_offset; else to_copy = size - bytes_copied; if (!copy_from_iter_full((u8 *)va + page_offset, to_copy, from)) { if (kernel_if->host) kunmap_local(va); return VMCI_ERROR_INVALID_ARGS; } bytes_copied += to_copy; if (kernel_if->host) kunmap_local(va); } return VMCI_SUCCESS; } /* * Copies to a given buffer or iovector from a VMCI Queue. Uses * kmap_local_page() to dynamically map required portions of the queue * by traversing the offset -> page translation structure for the queue. * Assumes that offset + size does not wrap around in the queue. */ static int qp_memcpy_from_queue_iter(struct iov_iter *to, const struct vmci_queue *queue, u64 queue_offset, size_t size) { struct vmci_queue_kern_if *kernel_if = queue->kernel_if; size_t bytes_copied = 0; while (bytes_copied < size) { const u64 page_index = (queue_offset + bytes_copied) / PAGE_SIZE; const size_t page_offset = (queue_offset + bytes_copied) & (PAGE_SIZE - 1); void *va; size_t to_copy; int err; if (kernel_if->host) va = kmap_local_page(kernel_if->u.h.page[page_index]); else va = kernel_if->u.g.vas[page_index + 1]; /* Skip header. */ if (size - bytes_copied > PAGE_SIZE - page_offset) /* Enough payload to fill up this page. */ to_copy = PAGE_SIZE - page_offset; else to_copy = size - bytes_copied; err = copy_to_iter((u8 *)va + page_offset, to_copy, to); if (err != to_copy) { if (kernel_if->host) kunmap_local(va); return VMCI_ERROR_INVALID_ARGS; } bytes_copied += to_copy; if (kernel_if->host) kunmap_local(va); } return VMCI_SUCCESS; } /* * Allocates two list of PPNs --- one for the pages in the produce queue, * and the other for the pages in the consume queue. Intializes the list * of PPNs with the page frame numbers of the KVA for the two queues (and * the queue headers). */ static int qp_alloc_ppn_set(void *prod_q, u64 num_produce_pages, void *cons_q, u64 num_consume_pages, struct ppn_set *ppn_set) { u64 *produce_ppns; u64 *consume_ppns; struct vmci_queue *produce_q = prod_q; struct vmci_queue *consume_q = cons_q; u64 i; if (!produce_q || !num_produce_pages || !consume_q || !num_consume_pages || !ppn_set) return VMCI_ERROR_INVALID_ARGS; if (ppn_set->initialized) return VMCI_ERROR_ALREADY_EXISTS; produce_ppns = kmalloc_array(num_produce_pages, sizeof(*produce_ppns), GFP_KERNEL); if (!produce_ppns) return VMCI_ERROR_NO_MEM; consume_ppns = kmalloc_array(num_consume_pages, sizeof(*consume_ppns), GFP_KERNEL); if (!consume_ppns) { kfree(produce_ppns); return VMCI_ERROR_NO_MEM; } for (i = 0; i < num_produce_pages; i++) produce_ppns[i] = produce_q->kernel_if->u.g.pas[i] >> PAGE_SHIFT; for (i = 0; i < num_consume_pages; i++) consume_ppns[i] = consume_q->kernel_if->u.g.pas[i] >> PAGE_SHIFT; ppn_set->num_produce_pages = num_produce_pages; ppn_set->num_consume_pages = num_consume_pages; ppn_set->produce_ppns = produce_ppns; ppn_set->consume_ppns = consume_ppns; ppn_set->initialized = true; return VMCI_SUCCESS; } /* * Frees the two list of PPNs for a queue pair. */ static void qp_free_ppn_set(struct ppn_set *ppn_set) { if (ppn_set->initialized) { /* Do not call these functions on NULL inputs. */ kfree(ppn_set->produce_ppns); kfree(ppn_set->consume_ppns); } memset(ppn_set, 0, sizeof(*ppn_set)); } /* * Populates the list of PPNs in the hypercall structure with the PPNS * of the produce queue and the consume queue. */ static int qp_populate_ppn_set(u8 *call_buf, const struct ppn_set *ppn_set) { if (vmci_use_ppn64()) { memcpy(call_buf, ppn_set->produce_ppns, ppn_set->num_produce_pages * sizeof(*ppn_set->produce_ppns)); memcpy(call_buf + ppn_set->num_produce_pages * sizeof(*ppn_set->produce_ppns), ppn_set->consume_ppns, ppn_set->num_consume_pages * sizeof(*ppn_set->consume_ppns)); } else { int i; u32 *ppns = (u32 *) call_buf; for (i = 0; i < ppn_set->num_produce_pages; i++) ppns[i] = (u32) ppn_set->produce_ppns[i]; ppns = &ppns[ppn_set->num_produce_pages]; for (i = 0; i < ppn_set->num_consume_pages; i++) ppns[i] = (u32) ppn_set->consume_ppns[i]; } return VMCI_SUCCESS; } /* * Allocates kernel VA space of specified size plus space for the queue * and kernel interface. This is different from the guest queue allocator, * because we do not allocate our own queue header/data pages here but * share those of the guest. */ static struct vmci_queue *qp_host_alloc_queue(u64 size) { struct vmci_queue *queue; size_t queue_page_size; u64 num_pages; const size_t queue_size = sizeof(*queue) + sizeof(*(queue->kernel_if)); if (size > min_t(size_t, VMCI_MAX_GUEST_QP_MEMORY, SIZE_MAX - PAGE_SIZE)) return NULL; num_pages = DIV_ROUND_UP(size, PAGE_SIZE) + 1; if (num_pages > (SIZE_MAX - queue_size) / sizeof(*queue->kernel_if->u.h.page)) return NULL; queue_page_size = num_pages * sizeof(*queue->kernel_if->u.h.page); if (queue_size + queue_page_size > KMALLOC_MAX_SIZE) return NULL; queue = kzalloc(queue_size + queue_page_size, GFP_KERNEL); if (queue) { queue->q_header = NULL; queue->saved_header = NULL; queue->kernel_if = (struct vmci_queue_kern_if *)(queue + 1); queue->kernel_if->host = true; queue->kernel_if->mutex = NULL; queue->kernel_if->num_pages = num_pages; queue->kernel_if->u.h.header_page = (struct page **)((u8 *)queue + queue_size); queue->kernel_if->u.h.page = &queue->kernel_if->u.h.header_page[1]; } return queue; } /* * Frees kernel memory for a given queue (header plus translation * structure). */ static void qp_host_free_queue(struct vmci_queue *queue, u64 queue_size) { kfree(queue); } /* * Initialize the mutex for the pair of queues. This mutex is used to * protect the q_header and the buffer from changing out from under any * users of either queue. Of course, it's only any good if the mutexes * are actually acquired. Queue structure must lie on non-paged memory * or we cannot guarantee access to the mutex. */ static void qp_init_queue_mutex(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { /* * Only the host queue has shared state - the guest queues do not * need to synchronize access using a queue mutex. */ if (produce_q->kernel_if->host) { produce_q->kernel_if->mutex = &produce_q->kernel_if->__mutex; consume_q->kernel_if->mutex = &produce_q->kernel_if->__mutex; mutex_init(produce_q->kernel_if->mutex); } } /* * Cleans up the mutex for the pair of queues. */ static void qp_cleanup_queue_mutex(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { if (produce_q->kernel_if->host) { produce_q->kernel_if->mutex = NULL; consume_q->kernel_if->mutex = NULL; } } /* * Acquire the mutex for the queue. Note that the produce_q and * the consume_q share a mutex. So, only one of the two need to * be passed in to this routine. Either will work just fine. */ static void qp_acquire_queue_mutex(struct vmci_queue *queue) { if (queue->kernel_if->host) mutex_lock(queue->kernel_if->mutex); } /* * Release the mutex for the queue. Note that the produce_q and * the consume_q share a mutex. So, only one of the two need to * be passed in to this routine. Either will work just fine. */ static void qp_release_queue_mutex(struct vmci_queue *queue) { if (queue->kernel_if->host) mutex_unlock(queue->kernel_if->mutex); } /* * Helper function to release pages in the PageStoreAttachInfo * previously obtained using get_user_pages. */ static void qp_release_pages(struct page **pages, u64 num_pages, bool dirty) { int i; for (i = 0; i < num_pages; i++) { if (dirty) set_page_dirty_lock(pages[i]); put_page(pages[i]); pages[i] = NULL; } } /* * Lock the user pages referenced by the {produce,consume}Buffer * struct into memory and populate the {produce,consume}Pages * arrays in the attach structure with them. */ static int qp_host_get_user_memory(u64 produce_uva, u64 consume_uva, struct vmci_queue *produce_q, struct vmci_queue *consume_q) { int retval; int err = VMCI_SUCCESS; retval = get_user_pages_fast((uintptr_t) produce_uva, produce_q->kernel_if->num_pages, FOLL_WRITE, produce_q->kernel_if->u.h.header_page); if (retval < (int)produce_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(produce) failed (retval=%d)", retval); if (retval > 0) qp_release_pages(produce_q->kernel_if->u.h.header_page, retval, false); err = VMCI_ERROR_NO_MEM; goto out; } retval = get_user_pages_fast((uintptr_t) consume_uva, consume_q->kernel_if->num_pages, FOLL_WRITE, consume_q->kernel_if->u.h.header_page); if (retval < (int)consume_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(consume) failed (retval=%d)", retval); if (retval > 0) qp_release_pages(consume_q->kernel_if->u.h.header_page, retval, false); qp_release_pages(produce_q->kernel_if->u.h.header_page, produce_q->kernel_if->num_pages, false); err = VMCI_ERROR_NO_MEM; } out: return err; } /* * Registers the specification of the user pages used for backing a queue * pair. Enough information to map in pages is stored in the OS specific * part of the struct vmci_queue structure. */ static int qp_host_register_user_memory(struct vmci_qp_page_store *page_store, struct vmci_queue *produce_q, struct vmci_queue *consume_q) { u64 produce_uva; u64 consume_uva; /* * The new style and the old style mapping only differs in * that we either get a single or two UVAs, so we split the * single UVA range at the appropriate spot. */ produce_uva = page_store->pages; consume_uva = page_store->pages + produce_q->kernel_if->num_pages * PAGE_SIZE; return qp_host_get_user_memory(produce_uva, consume_uva, produce_q, consume_q); } /* * Releases and removes the references to user pages stored in the attach * struct. Pages are released from the page cache and may become * swappable again. */ static void qp_host_unregister_user_memory(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { qp_release_pages(produce_q->kernel_if->u.h.header_page, produce_q->kernel_if->num_pages, true); memset(produce_q->kernel_if->u.h.header_page, 0, sizeof(*produce_q->kernel_if->u.h.header_page) * produce_q->kernel_if->num_pages); qp_release_pages(consume_q->kernel_if->u.h.header_page, consume_q->kernel_if->num_pages, true); memset(consume_q->kernel_if->u.h.header_page, 0, sizeof(*consume_q->kernel_if->u.h.header_page) * consume_q->kernel_if->num_pages); } /* * Once qp_host_register_user_memory has been performed on a * queue, the queue pair headers can be mapped into the * kernel. Once mapped, they must be unmapped with * qp_host_unmap_queues prior to calling * qp_host_unregister_user_memory. * Pages are pinned. */ static int qp_host_map_queues(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { int result; if (!produce_q->q_header || !consume_q->q_header) { struct page *headers[2]; if (produce_q->q_header != consume_q->q_header) return VMCI_ERROR_QUEUEPAIR_MISMATCH; if (produce_q->kernel_if->u.h.header_page == NULL || *produce_q->kernel_if->u.h.header_page == NULL) return VMCI_ERROR_UNAVAILABLE; headers[0] = *produce_q->kernel_if->u.h.header_page; headers[1] = *consume_q->kernel_if->u.h.header_page; produce_q->q_header = vmap(headers, 2, VM_MAP, PAGE_KERNEL); if (produce_q->q_header != NULL) { consume_q->q_header = (struct vmci_queue_header *)((u8 *) produce_q->q_header + PAGE_SIZE); result = VMCI_SUCCESS; } else { pr_warn("vmap failed\n"); result = VMCI_ERROR_NO_MEM; } } else { result = VMCI_SUCCESS; } return result; } /* * Unmaps previously mapped queue pair headers from the kernel. * Pages are unpinned. */ static int qp_host_unmap_queues(u32 gid, struct vmci_queue *produce_q, struct vmci_queue *consume_q) { if (produce_q->q_header) { if (produce_q->q_header < consume_q->q_header) vunmap(produce_q->q_header); else vunmap(consume_q->q_header); produce_q->q_header = NULL; consume_q->q_header = NULL; } return VMCI_SUCCESS; } /* * Finds the entry in the list corresponding to a given handle. Assumes * that the list is locked. */ static struct qp_entry *qp_list_find(struct qp_list *qp_list, struct vmci_handle handle) { struct qp_entry *entry; if (vmci_handle_is_invalid(handle)) return NULL; list_for_each_entry(entry, &qp_list->head, list_item) { if (vmci_handle_is_equal(entry->handle, handle)) return entry; } return NULL; } /* * Finds the entry in the list corresponding to a given handle. */ static struct qp_guest_endpoint * qp_guest_handle_to_entry(struct vmci_handle handle) { struct qp_guest_endpoint *entry; struct qp_entry *qp = qp_list_find(&qp_guest_endpoints, handle); entry = qp ? container_of( qp, struct qp_guest_endpoint, qp) : NULL; return entry; } /* * Finds the entry in the list corresponding to a given handle. */ static struct qp_broker_entry * qp_broker_handle_to_entry(struct vmci_handle handle) { struct qp_broker_entry *entry; struct qp_entry *qp = qp_list_find(&qp_broker_list, handle); entry = qp ? container_of( qp, struct qp_broker_entry, qp) : NULL; return entry; } /* * Dispatches a queue pair event message directly into the local event * queue. */ static int qp_notify_peer_local(bool attach, struct vmci_handle handle) { u32 context_id = vmci_get_context_id(); struct vmci_event_qp ev; memset(&ev, 0, sizeof(ev)); ev.msg.hdr.dst = vmci_make_handle(context_id, VMCI_EVENT_HANDLER); ev.msg.hdr.src = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID); ev.msg.hdr.payload_size = sizeof(ev) - sizeof(ev.msg.hdr); ev.msg.event_data.event = attach ? VMCI_EVENT_QP_PEER_ATTACH : VMCI_EVENT_QP_PEER_DETACH; ev.payload.peer_id = context_id; ev.payload.handle = handle; return vmci_event_dispatch(&ev.msg.hdr); } /* * Allocates and initializes a qp_guest_endpoint structure. * Allocates a queue_pair rid (and handle) iff the given entry has * an invalid handle. 0 through VMCI_RESERVED_RESOURCE_ID_MAX * are reserved handles. Assumes that the QP list mutex is held * by the caller. */ static struct qp_guest_endpoint * qp_guest_endpoint_create(struct vmci_handle handle, u32 peer, u32 flags, u64 produce_size, u64 consume_size, void *produce_q, void *consume_q) { int result; struct qp_guest_endpoint *entry; /* One page each for the queue headers. */ const u64 num_ppns = DIV_ROUND_UP(produce_size, PAGE_SIZE) + DIV_ROUND_UP(consume_size, PAGE_SIZE) + 2; if (vmci_handle_is_invalid(handle)) { u32 context_id = vmci_get_context_id(); handle = vmci_make_handle(context_id, VMCI_INVALID_ID); } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry) { entry->qp.peer = peer; entry->qp.flags = flags; entry->qp.produce_size = produce_size; entry->qp.consume_size = consume_size; entry->qp.ref_count = 0; entry->num_ppns = num_ppns; entry->produce_q = produce_q; entry->consume_q = consume_q; INIT_LIST_HEAD(&entry->qp.list_item); /* Add resource obj */ result = vmci_resource_add(&entry->resource, VMCI_RESOURCE_TYPE_QPAIR_GUEST, handle); entry->qp.handle = vmci_resource_handle(&entry->resource); if ((result != VMCI_SUCCESS) || qp_list_find(&qp_guest_endpoints, entry->qp.handle)) { pr_warn("Failed to add new resource (handle=0x%x:0x%x), error: %d", handle.context, handle.resource, result); kfree(entry); entry = NULL; } } return entry; } /* * Frees a qp_guest_endpoint structure. */ static void qp_guest_endpoint_destroy(struct qp_guest_endpoint *entry) { qp_free_ppn_set(&entry->ppn_set); qp_cleanup_queue_mutex(entry->produce_q, entry->consume_q); qp_free_queue(entry->produce_q, entry->qp.produce_size); qp_free_queue(entry->consume_q, entry->qp.consume_size); /* Unlink from resource hash table and free callback */ vmci_resource_remove(&entry->resource); kfree(entry); } /* * Helper to make a queue_pairAlloc hypercall when the driver is * supporting a guest device. */ static int qp_alloc_hypercall(const struct qp_guest_endpoint *entry) { struct vmci_qp_alloc_msg *alloc_msg; size_t msg_size; size_t ppn_size; int result; if (!entry || entry->num_ppns <= 2) return VMCI_ERROR_INVALID_ARGS; ppn_size = vmci_use_ppn64() ? sizeof(u64) : sizeof(u32); msg_size = sizeof(*alloc_msg) + (size_t) entry->num_ppns * ppn_size; alloc_msg = kmalloc(msg_size, GFP_KERNEL); if (!alloc_msg) return VMCI_ERROR_NO_MEM; alloc_msg->hdr.dst = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_QUEUEPAIR_ALLOC); alloc_msg->hdr.src = VMCI_ANON_SRC_HANDLE; alloc_msg->hdr.payload_size = msg_size - VMCI_DG_HEADERSIZE; alloc_msg->handle = entry->qp.handle; alloc_msg->peer = entry->qp.peer; alloc_msg->flags = entry->qp.flags; alloc_msg->produce_size = entry->qp.produce_size; alloc_msg->consume_size = entry->qp.consume_size; alloc_msg->num_ppns = entry->num_ppns; result = qp_populate_ppn_set((u8 *)alloc_msg + sizeof(*alloc_msg), &entry->ppn_set); if (result == VMCI_SUCCESS) result = vmci_send_datagram(&alloc_msg->hdr); kfree(alloc_msg); return result; } /* * Helper to make a queue_pairDetach hypercall when the driver is * supporting a guest device. */ static int qp_detatch_hypercall(struct vmci_handle handle) { struct vmci_qp_detach_msg detach_msg; detach_msg.hdr.dst = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_QUEUEPAIR_DETACH); detach_msg.hdr.src = VMCI_ANON_SRC_HANDLE; detach_msg.hdr.payload_size = sizeof(handle); detach_msg.handle = handle; return vmci_send_datagram(&detach_msg.hdr); } /* * Adds the given entry to the list. Assumes that the list is locked. */ static void qp_list_add_entry(struct qp_list *qp_list, struct qp_entry *entry) { if (entry) list_add(&entry->list_item, &qp_list->head); } /* * Removes the given entry from the list. Assumes that the list is locked. */ static void qp_list_remove_entry(struct qp_list *qp_list, struct qp_entry *entry) { if (entry) list_del(&entry->list_item); } /* * Helper for VMCI queue_pair detach interface. Frees the physical * pages for the queue pair. */ static int qp_detatch_guest_work(struct vmci_handle handle) { int result; struct qp_guest_endpoint *entry; u32 ref_count = ~0; /* To avoid compiler warning below */ mutex_lock(&qp_guest_endpoints.mutex); entry = qp_guest_handle_to_entry(handle); if (!entry) { mutex_unlock(&qp_guest_endpoints.mutex); return VMCI_ERROR_NOT_FOUND; } if (entry->qp.flags & VMCI_QPFLAG_LOCAL) { result = VMCI_SUCCESS; if (entry->qp.ref_count > 1) { result = qp_notify_peer_local(false, handle); /* * We can fail to notify a local queuepair * because we can't allocate. We still want * to release the entry if that happens, so * don't bail out yet. */ } } else { result = qp_detatch_hypercall(handle); if (result < VMCI_SUCCESS) { /* * We failed to notify a non-local queuepair. * That other queuepair might still be * accessing the shared memory, so don't * release the entry yet. It will get cleaned * up by VMCIqueue_pair_Exit() if necessary * (assuming we are going away, otherwise why * did this fail?). */ mutex_unlock(&qp_guest_endpoints.mutex); return result; } } /* * If we get here then we either failed to notify a local queuepair, or * we succeeded in all cases. Release the entry if required. */ entry->qp.ref_count--; if (entry->qp.ref_count == 0) qp_list_remove_entry(&qp_guest_endpoints, &entry->qp); /* If we didn't remove the entry, this could change once we unlock. */ if (entry) ref_count = entry->qp.ref_count; mutex_unlock(&qp_guest_endpoints.mutex); if (ref_count == 0) qp_guest_endpoint_destroy(entry); return result; } /* * This functions handles the actual allocation of a VMCI queue * pair guest endpoint. Allocates physical pages for the queue * pair. It makes OS dependent calls through generic wrappers. */ static int qp_alloc_guest_work(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags) { const u64 num_produce_pages = DIV_ROUND_UP(produce_size, PAGE_SIZE) + 1; const u64 num_consume_pages = DIV_ROUND_UP(consume_size, PAGE_SIZE) + 1; void *my_produce_q = NULL; void *my_consume_q = NULL; int result; struct qp_guest_endpoint *queue_pair_entry = NULL; if (priv_flags != VMCI_NO_PRIVILEGE_FLAGS) return VMCI_ERROR_NO_ACCESS; mutex_lock(&qp_guest_endpoints.mutex); queue_pair_entry = qp_guest_handle_to_entry(*handle); if (queue_pair_entry) { if (queue_pair_entry->qp.flags & VMCI_QPFLAG_LOCAL) { /* Local attach case. */ if (queue_pair_entry->qp.ref_count > 1) { pr_devel("Error attempting to attach more than once\n"); result = VMCI_ERROR_UNAVAILABLE; goto error_keep_entry; } if (queue_pair_entry->qp.produce_size != consume_size || queue_pair_entry->qp.consume_size != produce_size || queue_pair_entry->qp.flags != (flags & ~VMCI_QPFLAG_ATTACH_ONLY)) { pr_devel("Error mismatched queue pair in local attach\n"); result = VMCI_ERROR_QUEUEPAIR_MISMATCH; goto error_keep_entry; } /* * Do a local attach. We swap the consume and * produce queues for the attacher and deliver * an attach event. */ result = qp_notify_peer_local(true, *handle); if (result < VMCI_SUCCESS) goto error_keep_entry; my_produce_q = queue_pair_entry->consume_q; my_consume_q = queue_pair_entry->produce_q; goto out; } result = VMCI_ERROR_ALREADY_EXISTS; goto error_keep_entry; } my_produce_q = qp_alloc_queue(produce_size, flags); if (!my_produce_q) { pr_warn("Error allocating pages for produce queue\n"); result = VMCI_ERROR_NO_MEM; goto error; } my_consume_q = qp_alloc_queue(consume_size, flags); if (!my_consume_q) { pr_warn("Error allocating pages for consume queue\n"); result = VMCI_ERROR_NO_MEM; goto error; } queue_pair_entry = qp_guest_endpoint_create(*handle, peer, flags, produce_size, consume_size, my_produce_q, my_consume_q); if (!queue_pair_entry) { pr_warn("Error allocating memory in %s\n", __func__); result = VMCI_ERROR_NO_MEM; goto error; } result = qp_alloc_ppn_set(my_produce_q, num_produce_pages, my_consume_q, num_consume_pages, &queue_pair_entry->ppn_set); if (result < VMCI_SUCCESS) { pr_warn("qp_alloc_ppn_set failed\n"); goto error; } /* * It's only necessary to notify the host if this queue pair will be * attached to from another context. */ if (queue_pair_entry->qp.flags & VMCI_QPFLAG_LOCAL) { /* Local create case. */ u32 context_id = vmci_get_context_id(); /* * Enforce similar checks on local queue pairs as we * do for regular ones. The handle's context must * match the creator or attacher context id (here they * are both the current context id) and the * attach-only flag cannot exist during create. We * also ensure specified peer is this context or an * invalid one. */ if (queue_pair_entry->qp.handle.context != context_id || (queue_pair_entry->qp.peer != VMCI_INVALID_ID && queue_pair_entry->qp.peer != context_id)) { result = VMCI_ERROR_NO_ACCESS; goto error; } if (queue_pair_entry->qp.flags & VMCI_QPFLAG_ATTACH_ONLY) { result = VMCI_ERROR_NOT_FOUND; goto error; } } else { result = qp_alloc_hypercall(queue_pair_entry); if (result < VMCI_SUCCESS) { pr_devel("qp_alloc_hypercall result = %d\n", result); goto error; } } qp_init_queue_mutex((struct vmci_queue *)my_produce_q, (struct vmci_queue *)my_consume_q); qp_list_add_entry(&qp_guest_endpoints, &queue_pair_entry->qp); out: queue_pair_entry->qp.ref_count++; *handle = queue_pair_entry->qp.handle; *produce_q = (struct vmci_queue *)my_produce_q; *consume_q = (struct vmci_queue *)my_consume_q; /* * We should initialize the queue pair header pages on a local * queue pair create. For non-local queue pairs, the * hypervisor initializes the header pages in the create step. */ if ((queue_pair_entry->qp.flags & VMCI_QPFLAG_LOCAL) && queue_pair_entry->qp.ref_count == 1) { vmci_q_header_init((*produce_q)->q_header, *handle); vmci_q_header_init((*consume_q)->q_header, *handle); } mutex_unlock(&qp_guest_endpoints.mutex); return VMCI_SUCCESS; error: mutex_unlock(&qp_guest_endpoints.mutex); if (queue_pair_entry) { /* The queues will be freed inside the destroy routine. */ qp_guest_endpoint_destroy(queue_pair_entry); } else { qp_free_queue(my_produce_q, produce_size); qp_free_queue(my_consume_q, consume_size); } return result; error_keep_entry: /* This path should only be used when an existing entry was found. */ mutex_unlock(&qp_guest_endpoints.mutex); return result; } /* * The first endpoint issuing a queue pair allocation will create the state * of the queue pair in the queue pair broker. * * If the creator is a guest, it will associate a VMX virtual address range * with the queue pair as specified by the page_store. For compatibility with * older VMX'en, that would use a separate step to set the VMX virtual * address range, the virtual address range can be registered later using * vmci_qp_broker_set_page_store. In that case, a page_store of NULL should be * used. * * If the creator is the host, a page_store of NULL should be used as well, * since the host is not able to supply a page store for the queue pair. * * For older VMX and host callers, the queue pair will be created in the * VMCIQPB_CREATED_NO_MEM state, and for current VMX callers, it will be * created in VMCOQPB_CREATED_MEM state. */ static int qp_broker_create(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context, vmci_event_release_cb wakeup_cb, void *client_data, struct qp_broker_entry **ent) { struct qp_broker_entry *entry = NULL; const u32 context_id = vmci_ctx_get_id(context); bool is_local = flags & VMCI_QPFLAG_LOCAL; int result; u64 guest_produce_size; u64 guest_consume_size; /* Do not create if the caller asked not to. */ if (flags & VMCI_QPFLAG_ATTACH_ONLY) return VMCI_ERROR_NOT_FOUND; /* * Creator's context ID should match handle's context ID or the creator * must allow the context in handle's context ID as the "peer". */ if (handle.context != context_id && handle.context != peer) return VMCI_ERROR_NO_ACCESS; if (VMCI_CONTEXT_IS_VM(context_id) && VMCI_CONTEXT_IS_VM(peer)) return VMCI_ERROR_DST_UNREACHABLE; /* * Creator's context ID for local queue pairs should match the * peer, if a peer is specified. */ if (is_local && peer != VMCI_INVALID_ID && context_id != peer) return VMCI_ERROR_NO_ACCESS; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return VMCI_ERROR_NO_MEM; if (vmci_ctx_get_id(context) == VMCI_HOST_CONTEXT_ID && !is_local) { /* * The queue pair broker entry stores values from the guest * point of view, so a creating host side endpoint should swap * produce and consume values -- unless it is a local queue * pair, in which case no swapping is necessary, since the local * attacher will swap queues. */ guest_produce_size = consume_size; guest_consume_size = produce_size; } else { guest_produce_size = produce_size; guest_consume_size = consume_size; } entry->qp.handle = handle; entry->qp.peer = peer; entry->qp.flags = flags; entry->qp.produce_size = guest_produce_size; entry->qp.consume_size = guest_consume_size; entry->qp.ref_count = 1; entry->create_id = context_id; entry->attach_id = VMCI_INVALID_ID; entry->state = VMCIQPB_NEW; entry->require_trusted_attach = !!(context->priv_flags & VMCI_PRIVILEGE_FLAG_RESTRICTED); entry->created_by_trusted = !!(priv_flags & VMCI_PRIVILEGE_FLAG_TRUSTED); entry->vmci_page_files = false; entry->wakeup_cb = wakeup_cb; entry->client_data = client_data; entry->produce_q = qp_host_alloc_queue(guest_produce_size); if (entry->produce_q == NULL) { result = VMCI_ERROR_NO_MEM; goto error; } entry->consume_q = qp_host_alloc_queue(guest_consume_size); if (entry->consume_q == NULL) { result = VMCI_ERROR_NO_MEM; goto error; } qp_init_queue_mutex(entry->produce_q, entry->consume_q); INIT_LIST_HEAD(&entry->qp.list_item); if (is_local) { u8 *tmp; entry->local_mem = kcalloc(QPE_NUM_PAGES(entry->qp), PAGE_SIZE, GFP_KERNEL); if (entry->local_mem == NULL) { result = VMCI_ERROR_NO_MEM; goto error; } entry->state = VMCIQPB_CREATED_MEM; entry->produce_q->q_header = entry->local_mem; tmp = (u8 *)entry->local_mem + PAGE_SIZE * (DIV_ROUND_UP(entry->qp.produce_size, PAGE_SIZE) + 1); entry->consume_q->q_header = (struct vmci_queue_header *)tmp; } else if (page_store) { /* * The VMX already initialized the queue pair headers, so no * need for the kernel side to do that. */ result = qp_host_register_user_memory(page_store, entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) goto error; entry->state = VMCIQPB_CREATED_MEM; } else { /* * A create without a page_store may be either a host * side create (in which case we are waiting for the * guest side to supply the memory) or an old style * queue pair create (in which case we will expect a * set page store call as the next step). */ entry->state = VMCIQPB_CREATED_NO_MEM; } qp_list_add_entry(&qp_broker_list, &entry->qp); if (ent != NULL) *ent = entry; /* Add to resource obj */ result = vmci_resource_add(&entry->resource, VMCI_RESOURCE_TYPE_QPAIR_HOST, handle); if (result != VMCI_SUCCESS) { pr_warn("Failed to add new resource (handle=0x%x:0x%x), error: %d", handle.context, handle.resource, result); goto error; } entry->qp.handle = vmci_resource_handle(&entry->resource); if (is_local) { vmci_q_header_init(entry->produce_q->q_header, entry->qp.handle); vmci_q_header_init(entry->consume_q->q_header, entry->qp.handle); } vmci_ctx_qp_create(context, entry->qp.handle); return VMCI_SUCCESS; error: if (entry != NULL) { qp_host_free_queue(entry->produce_q, guest_produce_size); qp_host_free_queue(entry->consume_q, guest_consume_size); kfree(entry); } return result; } /* * Enqueues an event datagram to notify the peer VM attached to * the given queue pair handle about attach/detach event by the * given VM. Returns Payload size of datagram enqueued on * success, error code otherwise. */ static int qp_notify_peer(bool attach, struct vmci_handle handle, u32 my_id, u32 peer_id) { int rv; struct vmci_event_qp ev; if (vmci_handle_is_invalid(handle) || my_id == VMCI_INVALID_ID || peer_id == VMCI_INVALID_ID) return VMCI_ERROR_INVALID_ARGS; /* * In vmci_ctx_enqueue_datagram() we enforce the upper limit on * number of pending events from the hypervisor to a given VM * otherwise a rogue VM could do an arbitrary number of attach * and detach operations causing memory pressure in the host * kernel. */ memset(&ev, 0, sizeof(ev)); ev.msg.hdr.dst = vmci_make_handle(peer_id, VMCI_EVENT_HANDLER); ev.msg.hdr.src = vmci_make_handle(VMCI_HYPERVISOR_CONTEXT_ID, VMCI_CONTEXT_RESOURCE_ID); ev.msg.hdr.payload_size = sizeof(ev) - sizeof(ev.msg.hdr); ev.msg.event_data.event = attach ? VMCI_EVENT_QP_PEER_ATTACH : VMCI_EVENT_QP_PEER_DETACH; ev.payload.handle = handle; ev.payload.peer_id = my_id; rv = vmci_datagram_dispatch(VMCI_HYPERVISOR_CONTEXT_ID, &ev.msg.hdr, false); if (rv < VMCI_SUCCESS) pr_warn("Failed to enqueue queue_pair %s event datagram for context (ID=0x%x)\n", attach ? "ATTACH" : "DETACH", peer_id); return rv; } /* * The second endpoint issuing a queue pair allocation will attach to * the queue pair registered with the queue pair broker. * * If the attacher is a guest, it will associate a VMX virtual address * range with the queue pair as specified by the page_store. At this * point, the already attach host endpoint may start using the queue * pair, and an attach event is sent to it. For compatibility with * older VMX'en, that used a separate step to set the VMX virtual * address range, the virtual address range can be registered later * using vmci_qp_broker_set_page_store. In that case, a page_store of * NULL should be used, and the attach event will be generated once * the actual page store has been set. * * If the attacher is the host, a page_store of NULL should be used as * well, since the page store information is already set by the guest. * * For new VMX and host callers, the queue pair will be moved to the * VMCIQPB_ATTACHED_MEM state, and for older VMX callers, it will be * moved to the VMCOQPB_ATTACHED_NO_MEM state. */ static int qp_broker_attach(struct qp_broker_entry *entry, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context, vmci_event_release_cb wakeup_cb, void *client_data, struct qp_broker_entry **ent) { const u32 context_id = vmci_ctx_get_id(context); bool is_local = flags & VMCI_QPFLAG_LOCAL; int result; if (entry->state != VMCIQPB_CREATED_NO_MEM && entry->state != VMCIQPB_CREATED_MEM) return VMCI_ERROR_UNAVAILABLE; if (is_local) { if (!(entry->qp.flags & VMCI_QPFLAG_LOCAL) || context_id != entry->create_id) { return VMCI_ERROR_INVALID_ARGS; } } else if (context_id == entry->create_id || context_id == entry->attach_id) { return VMCI_ERROR_ALREADY_EXISTS; } if (VMCI_CONTEXT_IS_VM(context_id) && VMCI_CONTEXT_IS_VM(entry->create_id)) return VMCI_ERROR_DST_UNREACHABLE; /* * If we are attaching from a restricted context then the queuepair * must have been created by a trusted endpoint. */ if ((context->priv_flags & VMCI_PRIVILEGE_FLAG_RESTRICTED) && !entry->created_by_trusted) return VMCI_ERROR_NO_ACCESS; /* * If we are attaching to a queuepair that was created by a restricted * context then we must be trusted. */ if (entry->require_trusted_attach && (!(priv_flags & VMCI_PRIVILEGE_FLAG_TRUSTED))) return VMCI_ERROR_NO_ACCESS; /* * If the creator specifies VMCI_INVALID_ID in "peer" field, access * control check is not performed. */ if (entry->qp.peer != VMCI_INVALID_ID && entry->qp.peer != context_id) return VMCI_ERROR_NO_ACCESS; if (entry->create_id == VMCI_HOST_CONTEXT_ID) { /* * Do not attach if the caller doesn't support Host Queue Pairs * and a host created this queue pair. */ if (!vmci_ctx_supports_host_qp(context)) return VMCI_ERROR_INVALID_RESOURCE; } else if (context_id == VMCI_HOST_CONTEXT_ID) { struct vmci_ctx *create_context; bool supports_host_qp; /* * Do not attach a host to a user created queue pair if that * user doesn't support host queue pair end points. */ create_context = vmci_ctx_get(entry->create_id); supports_host_qp = vmci_ctx_supports_host_qp(create_context); vmci_ctx_put(create_context); if (!supports_host_qp) return VMCI_ERROR_INVALID_RESOURCE; } if ((entry->qp.flags & ~VMCI_QP_ASYMM) != (flags & ~VMCI_QP_ASYMM_PEER)) return VMCI_ERROR_QUEUEPAIR_MISMATCH; if (context_id != VMCI_HOST_CONTEXT_ID) { /* * The queue pair broker entry stores values from the guest * point of view, so an attaching guest should match the values * stored in the entry. */ if (entry->qp.produce_size != produce_size || entry->qp.consume_size != consume_size) { return VMCI_ERROR_QUEUEPAIR_MISMATCH; } } else if (entry->qp.produce_size != consume_size || entry->qp.consume_size != produce_size) { return VMCI_ERROR_QUEUEPAIR_MISMATCH; } if (context_id != VMCI_HOST_CONTEXT_ID) { /* * If a guest attached to a queue pair, it will supply * the backing memory. If this is a pre NOVMVM vmx, * the backing memory will be supplied by calling * vmci_qp_broker_set_page_store() following the * return of the vmci_qp_broker_alloc() call. If it is * a vmx of version NOVMVM or later, the page store * must be supplied as part of the * vmci_qp_broker_alloc call. Under all circumstances * must the initially created queue pair not have any * memory associated with it already. */ if (entry->state != VMCIQPB_CREATED_NO_MEM) return VMCI_ERROR_INVALID_ARGS; if (page_store != NULL) { /* * Patch up host state to point to guest * supplied memory. The VMX already * initialized the queue pair headers, so no * need for the kernel side to do that. */ result = qp_host_register_user_memory(page_store, entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) return result; entry->state = VMCIQPB_ATTACHED_MEM; } else { entry->state = VMCIQPB_ATTACHED_NO_MEM; } } else if (entry->state == VMCIQPB_CREATED_NO_MEM) { /* * The host side is attempting to attach to a queue * pair that doesn't have any memory associated with * it. This must be a pre NOVMVM vmx that hasn't set * the page store information yet, or a quiesced VM. */ return VMCI_ERROR_UNAVAILABLE; } else { /* The host side has successfully attached to a queue pair. */ entry->state = VMCIQPB_ATTACHED_MEM; } if (entry->state == VMCIQPB_ATTACHED_MEM) { result = qp_notify_peer(true, entry->qp.handle, context_id, entry->create_id); if (result < VMCI_SUCCESS) pr_warn("Failed to notify peer (ID=0x%x) of attach to queue pair (handle=0x%x:0x%x)\n", entry->create_id, entry->qp.handle.context, entry->qp.handle.resource); } entry->attach_id = context_id; entry->qp.ref_count++; if (wakeup_cb) { entry->wakeup_cb = wakeup_cb; entry->client_data = client_data; } /* * When attaching to local queue pairs, the context already has * an entry tracking the queue pair, so don't add another one. */ if (!is_local) vmci_ctx_qp_create(context, entry->qp.handle); if (ent != NULL) *ent = entry; return VMCI_SUCCESS; } /* * queue_pair_Alloc for use when setting up queue pair endpoints * on the host. */ static int qp_broker_alloc(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context, vmci_event_release_cb wakeup_cb, void *client_data, struct qp_broker_entry **ent, bool *swap) { const u32 context_id = vmci_ctx_get_id(context); bool create; struct qp_broker_entry *entry = NULL; bool is_local = flags & VMCI_QPFLAG_LOCAL; int result; if (vmci_handle_is_invalid(handle) || (flags & ~VMCI_QP_ALL_FLAGS) || is_local || !(produce_size || consume_size) || !context || context_id == VMCI_INVALID_ID || handle.context == VMCI_INVALID_ID) { return VMCI_ERROR_INVALID_ARGS; } if (page_store && !VMCI_QP_PAGESTORE_IS_WELLFORMED(page_store)) return VMCI_ERROR_INVALID_ARGS; /* * In the initial argument check, we ensure that non-vmkernel hosts * are not allowed to create local queue pairs. */ mutex_lock(&qp_broker_list.mutex); if (!is_local && vmci_ctx_qp_exists(context, handle)) { pr_devel("Context (ID=0x%x) already attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); mutex_unlock(&qp_broker_list.mutex); return VMCI_ERROR_ALREADY_EXISTS; } if (handle.resource != VMCI_INVALID_ID) entry = qp_broker_handle_to_entry(handle); if (!entry) { create = true; result = qp_broker_create(handle, peer, flags, priv_flags, produce_size, consume_size, page_store, context, wakeup_cb, client_data, ent); } else { create = false; result = qp_broker_attach(entry, peer, flags, priv_flags, produce_size, consume_size, page_store, context, wakeup_cb, client_data, ent); } mutex_unlock(&qp_broker_list.mutex); if (swap) *swap = (context_id == VMCI_HOST_CONTEXT_ID) && !(create && is_local); return result; } /* * This function implements the kernel API for allocating a queue * pair. */ static int qp_alloc_host_work(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags, vmci_event_release_cb wakeup_cb, void *client_data) { struct vmci_handle new_handle; struct vmci_ctx *context; struct qp_broker_entry *entry; int result; bool swap; if (vmci_handle_is_invalid(*handle)) { new_handle = vmci_make_handle( VMCI_HOST_CONTEXT_ID, VMCI_INVALID_ID); } else new_handle = *handle; context = vmci_ctx_get(VMCI_HOST_CONTEXT_ID); entry = NULL; result = qp_broker_alloc(new_handle, peer, flags, priv_flags, produce_size, consume_size, NULL, context, wakeup_cb, client_data, &entry, &swap); if (result == VMCI_SUCCESS) { if (swap) { /* * If this is a local queue pair, the attacher * will swap around produce and consume * queues. */ *produce_q = entry->consume_q; *consume_q = entry->produce_q; } else { *produce_q = entry->produce_q; *consume_q = entry->consume_q; } *handle = vmci_resource_handle(&entry->resource); } else { *handle = VMCI_INVALID_HANDLE; pr_devel("queue pair broker failed to alloc (result=%d)\n", result); } vmci_ctx_put(context); return result; } /* * Allocates a VMCI queue_pair. Only checks validity of input * arguments. The real work is done in the host or guest * specific function. */ int vmci_qp_alloc(struct vmci_handle *handle, struct vmci_queue **produce_q, u64 produce_size, struct vmci_queue **consume_q, u64 consume_size, u32 peer, u32 flags, u32 priv_flags, bool guest_endpoint, vmci_event_release_cb wakeup_cb, void *client_data) { if (!handle || !produce_q || !consume_q || (!produce_size && !consume_size) || (flags & ~VMCI_QP_ALL_FLAGS)) return VMCI_ERROR_INVALID_ARGS; if (guest_endpoint) { return qp_alloc_guest_work(handle, produce_q, produce_size, consume_q, consume_size, peer, flags, priv_flags); } else { return qp_alloc_host_work(handle, produce_q, produce_size, consume_q, consume_size, peer, flags, priv_flags, wakeup_cb, client_data); } } /* * This function implements the host kernel API for detaching from * a queue pair. */ static int qp_detatch_host_work(struct vmci_handle handle) { int result; struct vmci_ctx *context; context = vmci_ctx_get(VMCI_HOST_CONTEXT_ID); result = vmci_qp_broker_detach(handle, context); vmci_ctx_put(context); return result; } /* * Detaches from a VMCI queue_pair. Only checks validity of input argument. * Real work is done in the host or guest specific function. */ static int qp_detatch(struct vmci_handle handle, bool guest_endpoint) { if (vmci_handle_is_invalid(handle)) return VMCI_ERROR_INVALID_ARGS; if (guest_endpoint) return qp_detatch_guest_work(handle); else return qp_detatch_host_work(handle); } /* * Returns the entry from the head of the list. Assumes that the list is * locked. */ static struct qp_entry *qp_list_get_head(struct qp_list *qp_list) { if (!list_empty(&qp_list->head)) { struct qp_entry *entry = list_first_entry(&qp_list->head, struct qp_entry, list_item); return entry; } return NULL; } void vmci_qp_broker_exit(void) { struct qp_entry *entry; struct qp_broker_entry *be; mutex_lock(&qp_broker_list.mutex); while ((entry = qp_list_get_head(&qp_broker_list))) { be = (struct qp_broker_entry *)entry; qp_list_remove_entry(&qp_broker_list, entry); kfree(be); } mutex_unlock(&qp_broker_list.mutex); } /* * Requests that a queue pair be allocated with the VMCI queue * pair broker. Allocates a queue pair entry if one does not * exist. Attaches to one if it exists, and retrieves the page * files backing that queue_pair. Assumes that the queue pair * broker lock is held. */ int vmci_qp_broker_alloc(struct vmci_handle handle, u32 peer, u32 flags, u32 priv_flags, u64 produce_size, u64 consume_size, struct vmci_qp_page_store *page_store, struct vmci_ctx *context) { if (!QP_SIZES_ARE_VALID(produce_size, consume_size)) return VMCI_ERROR_NO_RESOURCES; return qp_broker_alloc(handle, peer, flags, priv_flags, produce_size, consume_size, page_store, context, NULL, NULL, NULL, NULL); } /* * VMX'en with versions lower than VMCI_VERSION_NOVMVM use a separate * step to add the UVAs of the VMX mapping of the queue pair. This function * provides backwards compatibility with such VMX'en, and takes care of * registering the page store for a queue pair previously allocated by the * VMX during create or attach. This function will move the queue pair state * to either from VMCIQBP_CREATED_NO_MEM to VMCIQBP_CREATED_MEM or * VMCIQBP_ATTACHED_NO_MEM to VMCIQBP_ATTACHED_MEM. If moving to the * attached state with memory, the queue pair is ready to be used by the * host peer, and an attached event will be generated. * * Assumes that the queue pair broker lock is held. * * This function is only used by the hosted platform, since there is no * issue with backwards compatibility for vmkernel. */ int vmci_qp_broker_set_page_store(struct vmci_handle handle, u64 produce_uva, u64 consume_uva, struct vmci_ctx *context) { struct qp_broker_entry *entry; int result; const u32 context_id = vmci_ctx_get_id(context); if (vmci_handle_is_invalid(handle) || !context || context_id == VMCI_INVALID_ID) return VMCI_ERROR_INVALID_ARGS; /* * We only support guest to host queue pairs, so the VMX must * supply UVAs for the mapped page files. */ if (produce_uva == 0 || consume_uva == 0) return VMCI_ERROR_INVALID_ARGS; mutex_lock(&qp_broker_list.mutex); if (!vmci_ctx_qp_exists(context, handle)) { pr_warn("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } entry = qp_broker_handle_to_entry(handle); if (!entry) { result = VMCI_ERROR_NOT_FOUND; goto out; } /* * If I'm the owner then I can set the page store. * * Or, if a host created the queue_pair and I'm the attached peer * then I can set the page store. */ if (entry->create_id != context_id && (entry->create_id != VMCI_HOST_CONTEXT_ID || entry->attach_id != context_id)) { result = VMCI_ERROR_QUEUEPAIR_NOTOWNER; goto out; } if (entry->state != VMCIQPB_CREATED_NO_MEM && entry->state != VMCIQPB_ATTACHED_NO_MEM) { result = VMCI_ERROR_UNAVAILABLE; goto out; } result = qp_host_get_user_memory(produce_uva, consume_uva, entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) goto out; result = qp_host_map_queues(entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) { qp_host_unregister_user_memory(entry->produce_q, entry->consume_q); goto out; } if (entry->state == VMCIQPB_CREATED_NO_MEM) entry->state = VMCIQPB_CREATED_MEM; else entry->state = VMCIQPB_ATTACHED_MEM; entry->vmci_page_files = true; if (entry->state == VMCIQPB_ATTACHED_MEM) { result = qp_notify_peer(true, handle, context_id, entry->create_id); if (result < VMCI_SUCCESS) { pr_warn("Failed to notify peer (ID=0x%x) of attach to queue pair (handle=0x%x:0x%x)\n", entry->create_id, entry->qp.handle.context, entry->qp.handle.resource); } } result = VMCI_SUCCESS; out: mutex_unlock(&qp_broker_list.mutex); return result; } /* * Resets saved queue headers for the given QP broker * entry. Should be used when guest memory becomes available * again, or the guest detaches. */ static void qp_reset_saved_headers(struct qp_broker_entry *entry) { entry->produce_q->saved_header = NULL; entry->consume_q->saved_header = NULL; } /* * The main entry point for detaching from a queue pair registered with the * queue pair broker. If more than one endpoint is attached to the queue * pair, the first endpoint will mainly decrement a reference count and * generate a notification to its peer. The last endpoint will clean up * the queue pair state registered with the broker. * * When a guest endpoint detaches, it will unmap and unregister the guest * memory backing the queue pair. If the host is still attached, it will * no longer be able to access the queue pair content. * * If the queue pair is already in a state where there is no memory * registered for the queue pair (any *_NO_MEM state), it will transition to * the VMCIQPB_SHUTDOWN_NO_MEM state. This will also happen, if a guest * endpoint is the first of two endpoints to detach. If the host endpoint is * the first out of two to detach, the queue pair will move to the * VMCIQPB_SHUTDOWN_MEM state. */ int vmci_qp_broker_detach(struct vmci_handle handle, struct vmci_ctx *context) { struct qp_broker_entry *entry; const u32 context_id = vmci_ctx_get_id(context); u32 peer_id; bool is_local = false; int result; if (vmci_handle_is_invalid(handle) || !context || context_id == VMCI_INVALID_ID) { return VMCI_ERROR_INVALID_ARGS; } mutex_lock(&qp_broker_list.mutex); if (!vmci_ctx_qp_exists(context, handle)) { pr_devel("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } entry = qp_broker_handle_to_entry(handle); if (!entry) { pr_devel("Context (ID=0x%x) reports being attached to queue pair(handle=0x%x:0x%x) that isn't present in broker\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } if (context_id != entry->create_id && context_id != entry->attach_id) { result = VMCI_ERROR_QUEUEPAIR_NOTATTACHED; goto out; } if (context_id == entry->create_id) { peer_id = entry->attach_id; entry->create_id = VMCI_INVALID_ID; } else { peer_id = entry->create_id; entry->attach_id = VMCI_INVALID_ID; } entry->qp.ref_count--; is_local = entry->qp.flags & VMCI_QPFLAG_LOCAL; if (context_id != VMCI_HOST_CONTEXT_ID) { bool headers_mapped; /* * Pre NOVMVM vmx'en may detach from a queue pair * before setting the page store, and in that case * there is no user memory to detach from. Also, more * recent VMX'en may detach from a queue pair in the * quiesced state. */ qp_acquire_queue_mutex(entry->produce_q); headers_mapped = entry->produce_q->q_header || entry->consume_q->q_header; if (QPBROKERSTATE_HAS_MEM(entry)) { result = qp_host_unmap_queues(INVALID_VMCI_GUEST_MEM_ID, entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) pr_warn("Failed to unmap queue headers for queue pair (handle=0x%x:0x%x,result=%d)\n", handle.context, handle.resource, result); qp_host_unregister_user_memory(entry->produce_q, entry->consume_q); } if (!headers_mapped) qp_reset_saved_headers(entry); qp_release_queue_mutex(entry->produce_q); if (!headers_mapped && entry->wakeup_cb) entry->wakeup_cb(entry->client_data); } else { if (entry->wakeup_cb) { entry->wakeup_cb = NULL; entry->client_data = NULL; } } if (entry->qp.ref_count == 0) { qp_list_remove_entry(&qp_broker_list, &entry->qp); if (is_local) kfree(entry->local_mem); qp_cleanup_queue_mutex(entry->produce_q, entry->consume_q); qp_host_free_queue(entry->produce_q, entry->qp.produce_size); qp_host_free_queue(entry->consume_q, entry->qp.consume_size); /* Unlink from resource hash table and free callback */ vmci_resource_remove(&entry->resource); kfree(entry); vmci_ctx_qp_destroy(context, handle); } else { qp_notify_peer(false, handle, context_id, peer_id); if (context_id == VMCI_HOST_CONTEXT_ID && QPBROKERSTATE_HAS_MEM(entry)) { entry->state = VMCIQPB_SHUTDOWN_MEM; } else { entry->state = VMCIQPB_SHUTDOWN_NO_MEM; } if (!is_local) vmci_ctx_qp_destroy(context, handle); } result = VMCI_SUCCESS; out: mutex_unlock(&qp_broker_list.mutex); return result; } /* * Establishes the necessary mappings for a queue pair given a * reference to the queue pair guest memory. This is usually * called when a guest is unquiesced and the VMX is allowed to * map guest memory once again. */ int vmci_qp_broker_map(struct vmci_handle handle, struct vmci_ctx *context, u64 guest_mem) { struct qp_broker_entry *entry; const u32 context_id = vmci_ctx_get_id(context); int result; if (vmci_handle_is_invalid(handle) || !context || context_id == VMCI_INVALID_ID) return VMCI_ERROR_INVALID_ARGS; mutex_lock(&qp_broker_list.mutex); if (!vmci_ctx_qp_exists(context, handle)) { pr_devel("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } entry = qp_broker_handle_to_entry(handle); if (!entry) { pr_devel("Context (ID=0x%x) reports being attached to queue pair (handle=0x%x:0x%x) that isn't present in broker\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } if (context_id != entry->create_id && context_id != entry->attach_id) { result = VMCI_ERROR_QUEUEPAIR_NOTATTACHED; goto out; } result = VMCI_SUCCESS; if (context_id != VMCI_HOST_CONTEXT_ID && !QPBROKERSTATE_HAS_MEM(entry)) { struct vmci_qp_page_store page_store; page_store.pages = guest_mem; page_store.len = QPE_NUM_PAGES(entry->qp); qp_acquire_queue_mutex(entry->produce_q); qp_reset_saved_headers(entry); result = qp_host_register_user_memory(&page_store, entry->produce_q, entry->consume_q); qp_release_queue_mutex(entry->produce_q); if (result == VMCI_SUCCESS) { /* Move state from *_NO_MEM to *_MEM */ entry->state++; if (entry->wakeup_cb) entry->wakeup_cb(entry->client_data); } } out: mutex_unlock(&qp_broker_list.mutex); return result; } /* * Saves a snapshot of the queue headers for the given QP broker * entry. Should be used when guest memory is unmapped. * Results: * VMCI_SUCCESS on success, appropriate error code if guest memory * can't be accessed.. */ static int qp_save_headers(struct qp_broker_entry *entry) { int result; if (entry->produce_q->saved_header != NULL && entry->consume_q->saved_header != NULL) { /* * If the headers have already been saved, we don't need to do * it again, and we don't want to map in the headers * unnecessarily. */ return VMCI_SUCCESS; } if (NULL == entry->produce_q->q_header || NULL == entry->consume_q->q_header) { result = qp_host_map_queues(entry->produce_q, entry->consume_q); if (result < VMCI_SUCCESS) return result; } memcpy(&entry->saved_produce_q, entry->produce_q->q_header, sizeof(entry->saved_produce_q)); entry->produce_q->saved_header = &entry->saved_produce_q; memcpy(&entry->saved_consume_q, entry->consume_q->q_header, sizeof(entry->saved_consume_q)); entry->consume_q->saved_header = &entry->saved_consume_q; return VMCI_SUCCESS; } /* * Removes all references to the guest memory of a given queue pair, and * will move the queue pair from state *_MEM to *_NO_MEM. It is usually * called when a VM is being quiesced where access to guest memory should * avoided. */ int vmci_qp_broker_unmap(struct vmci_handle handle, struct vmci_ctx *context, u32 gid) { struct qp_broker_entry *entry; const u32 context_id = vmci_ctx_get_id(context); int result; if (vmci_handle_is_invalid(handle) || !context || context_id == VMCI_INVALID_ID) return VMCI_ERROR_INVALID_ARGS; mutex_lock(&qp_broker_list.mutex); if (!vmci_ctx_qp_exists(context, handle)) { pr_devel("Context (ID=0x%x) not attached to queue pair (handle=0x%x:0x%x)\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } entry = qp_broker_handle_to_entry(handle); if (!entry) { pr_devel("Context (ID=0x%x) reports being attached to queue pair (handle=0x%x:0x%x) that isn't present in broker\n", context_id, handle.context, handle.resource); result = VMCI_ERROR_NOT_FOUND; goto out; } if (context_id != entry->create_id && context_id != entry->attach_id) { result = VMCI_ERROR_QUEUEPAIR_NOTATTACHED; goto out; } if (context_id != VMCI_HOST_CONTEXT_ID && QPBROKERSTATE_HAS_MEM(entry)) { qp_acquire_queue_mutex(entry->produce_q); result = qp_save_headers(entry); if (result < VMCI_SUCCESS) pr_warn("Failed to save queue headers for queue pair (handle=0x%x:0x%x,result=%d)\n", handle.context, handle.resource, result); qp_host_unmap_queues(gid, entry->produce_q, entry->consume_q); /* * On hosted, when we unmap queue pairs, the VMX will also * unmap the guest memory, so we invalidate the previously * registered memory. If the queue pair is mapped again at a * later point in time, we will need to reregister the user * memory with a possibly new user VA. */ qp_host_unregister_user_memory(entry->produce_q, entry->consume_q); /* * Move state from *_MEM to *_NO_MEM. */ entry->state--; qp_release_queue_mutex(entry->produce_q); } result = VMCI_SUCCESS; out: mutex_unlock(&qp_broker_list.mutex); return result; } /* * Destroys all guest queue pair endpoints. If active guest queue * pairs still exist, hypercalls to attempt detach from these * queue pairs will be made. Any failure to detach is silently * ignored. */ void vmci_qp_guest_endpoints_exit(void) { struct qp_entry *entry; struct qp_guest_endpoint *ep; mutex_lock(&qp_guest_endpoints.mutex); while ((entry = qp_list_get_head(&qp_guest_endpoints))) { ep = (struct qp_guest_endpoint *)entry; /* Don't make a hypercall for local queue_pairs. */ if (!(entry->flags & VMCI_QPFLAG_LOCAL)) qp_detatch_hypercall(entry->handle); /* We cannot fail the exit, so let's reset ref_count. */ entry->ref_count = 0; qp_list_remove_entry(&qp_guest_endpoints, entry); qp_guest_endpoint_destroy(ep); } mutex_unlock(&qp_guest_endpoints.mutex); } /* * Helper routine that will lock the queue pair before subsequent * operations. * Note: Non-blocking on the host side is currently only implemented in ESX. * Since non-blocking isn't yet implemented on the host personality we * have no reason to acquire a spin lock. So to avoid the use of an * unnecessary lock only acquire the mutex if we can block. */ static void qp_lock(const struct vmci_qp *qpair) { qp_acquire_queue_mutex(qpair->produce_q); } /* * Helper routine that unlocks the queue pair after calling * qp_lock. */ static void qp_unlock(const struct vmci_qp *qpair) { qp_release_queue_mutex(qpair->produce_q); } /* * The queue headers may not be mapped at all times. If a queue is * currently not mapped, it will be attempted to do so. */ static int qp_map_queue_headers(struct vmci_queue *produce_q, struct vmci_queue *consume_q) { int result; if (NULL == produce_q->q_header || NULL == consume_q->q_header) { result = qp_host_map_queues(produce_q, consume_q); if (result < VMCI_SUCCESS) return (produce_q->saved_header && consume_q->saved_header) ? VMCI_ERROR_QUEUEPAIR_NOT_READY : VMCI_ERROR_QUEUEPAIR_NOTATTACHED; } return VMCI_SUCCESS; } /* * Helper routine that will retrieve the produce and consume * headers of a given queue pair. If the guest memory of the * queue pair is currently not available, the saved queue headers * will be returned, if these are available. */ static int qp_get_queue_headers(const struct vmci_qp *qpair, struct vmci_queue_header **produce_q_header, struct vmci_queue_header **consume_q_header) { int result; result = qp_map_queue_headers(qpair->produce_q, qpair->consume_q); if (result == VMCI_SUCCESS) { *produce_q_header = qpair->produce_q->q_header; *consume_q_header = qpair->consume_q->q_header; } else if (qpair->produce_q->saved_header && qpair->consume_q->saved_header) { *produce_q_header = qpair->produce_q->saved_header; *consume_q_header = qpair->consume_q->saved_header; result = VMCI_SUCCESS; } return result; } /* * Callback from VMCI queue pair broker indicating that a queue * pair that was previously not ready, now either is ready or * gone forever. */ static int qp_wakeup_cb(void *client_data) { struct vmci_qp *qpair = (struct vmci_qp *)client_data; qp_lock(qpair); while (qpair->blocked > 0) { qpair->blocked--; qpair->generation++; wake_up(&qpair->event); } qp_unlock(qpair); return VMCI_SUCCESS; } /* * Makes the calling thread wait for the queue pair to become * ready for host side access. Returns true when thread is * woken up after queue pair state change, false otherwise. */ static bool qp_wait_for_ready_queue(struct vmci_qp *qpair) { unsigned int generation; qpair->blocked++; generation = qpair->generation; qp_unlock(qpair); wait_event(qpair->event, generation != qpair->generation); qp_lock(qpair); return true; } /* * Enqueues a given buffer to the produce queue using the provided * function. As many bytes as possible (space available in the queue) * are enqueued. Assumes the queue->mutex has been acquired. Returns * VMCI_ERROR_QUEUEPAIR_NOSPACE if no space was available to enqueue * data, VMCI_ERROR_INVALID_SIZE, if any queue pointer is outside the * queue (as defined by the queue size), VMCI_ERROR_INVALID_ARGS, if * an error occured when accessing the buffer, * VMCI_ERROR_QUEUEPAIR_NOTATTACHED, if the queue pair pages aren't * available. Otherwise, the number of bytes written to the queue is * returned. Updates the tail pointer of the produce queue. */ static ssize_t qp_enqueue_locked(struct vmci_queue *produce_q, struct vmci_queue *consume_q, const u64 produce_q_size, struct iov_iter *from) { s64 free_space; u64 tail; size_t buf_size = iov_iter_count(from); size_t written; ssize_t result; result = qp_map_queue_headers(produce_q, consume_q); if (unlikely(result != VMCI_SUCCESS)) return result; free_space = vmci_q_header_free_space(produce_q->q_header, consume_q->q_header, produce_q_size); if (free_space == 0) return VMCI_ERROR_QUEUEPAIR_NOSPACE; if (free_space < VMCI_SUCCESS) return (ssize_t) free_space; written = (size_t) (free_space > buf_size ? buf_size : free_space); tail = vmci_q_header_producer_tail(produce_q->q_header); if (likely(tail + written < produce_q_size)) { result = qp_memcpy_to_queue_iter(produce_q, tail, from, written); } else { /* Tail pointer wraps around. */ const size_t tmp = (size_t) (produce_q_size - tail); result = qp_memcpy_to_queue_iter(produce_q, tail, from, tmp); if (result >= VMCI_SUCCESS) result = qp_memcpy_to_queue_iter(produce_q, 0, from, written - tmp); } if (result < VMCI_SUCCESS) return result; /* * This virt_wmb() ensures that data written to the queue * is observable before the new producer_tail is. */ virt_wmb(); vmci_q_header_add_producer_tail(produce_q->q_header, written, produce_q_size); return written; } /* * Dequeues data (if available) from the given consume queue. Writes data * to the user provided buffer using the provided function. * Assumes the queue->mutex has been acquired. * Results: * VMCI_ERROR_QUEUEPAIR_NODATA if no data was available to dequeue. * VMCI_ERROR_INVALID_SIZE, if any queue pointer is outside the queue * (as defined by the queue size). * VMCI_ERROR_INVALID_ARGS, if an error occured when accessing the buffer. * Otherwise the number of bytes dequeued is returned. * Side effects: * Updates the head pointer of the consume queue. */ static ssize_t qp_dequeue_locked(struct vmci_queue *produce_q, struct vmci_queue *consume_q, const u64 consume_q_size, struct iov_iter *to, bool update_consumer) { size_t buf_size = iov_iter_count(to); s64 buf_ready; u64 head; size_t read; ssize_t result; result = qp_map_queue_headers(produce_q, consume_q); if (unlikely(result != VMCI_SUCCESS)) return result; buf_ready = vmci_q_header_buf_ready(consume_q->q_header, produce_q->q_header, consume_q_size); if (buf_ready == 0) return VMCI_ERROR_QUEUEPAIR_NODATA; if (buf_ready < VMCI_SUCCESS) return (ssize_t) buf_ready; /* * This virt_rmb() ensures that data from the queue will be read * after we have determined how much is ready to be consumed. */ virt_rmb(); read = (size_t) (buf_ready > buf_size ? buf_size : buf_ready); head = vmci_q_header_consumer_head(produce_q->q_header); if (likely(head + read < consume_q_size)) { result = qp_memcpy_from_queue_iter(to, consume_q, head, read); } else { /* Head pointer wraps around. */ const size_t tmp = (size_t) (consume_q_size - head); result = qp_memcpy_from_queue_iter(to, consume_q, head, tmp); if (result >= VMCI_SUCCESS) result = qp_memcpy_from_queue_iter(to, consume_q, 0, read - tmp); } if (result < VMCI_SUCCESS) return result; if (update_consumer) vmci_q_header_add_consumer_head(produce_q->q_header, read, consume_q_size); return read; } /* * vmci_qpair_alloc() - Allocates a queue pair. * @qpair: Pointer for the new vmci_qp struct. * @handle: Handle to track the resource. * @produce_qsize: Desired size of the producer queue. * @consume_qsize: Desired size of the consumer queue. * @peer: ContextID of the peer. * @flags: VMCI flags. * @priv_flags: VMCI priviledge flags. * * This is the client interface for allocating the memory for a * vmci_qp structure and then attaching to the underlying * queue. If an error occurs allocating the memory for the * vmci_qp structure no attempt is made to attach. If an * error occurs attaching, then the structure is freed. */ int vmci_qpair_alloc(struct vmci_qp **qpair, struct vmci_handle *handle, u64 produce_qsize, u64 consume_qsize, u32 peer, u32 flags, u32 priv_flags) { struct vmci_qp *my_qpair; int retval; struct vmci_handle src = VMCI_INVALID_HANDLE; struct vmci_handle dst = vmci_make_handle(peer, VMCI_INVALID_ID); enum vmci_route route; vmci_event_release_cb wakeup_cb; void *client_data; /* * Restrict the size of a queuepair. The device already * enforces a limit on the total amount of memory that can be * allocated to queuepairs for a guest. However, we try to * allocate this memory before we make the queuepair * allocation hypercall. On Linux, we allocate each page * separately, which means rather than fail, the guest will * thrash while it tries to allocate, and will become * increasingly unresponsive to the point where it appears to * be hung. So we place a limit on the size of an individual * queuepair here, and leave the device to enforce the * restriction on total queuepair memory. (Note that this * doesn't prevent all cases; a user with only this much * physical memory could still get into trouble.) The error * used by the device is NO_RESOURCES, so use that here too. */ if (!QP_SIZES_ARE_VALID(produce_qsize, consume_qsize)) return VMCI_ERROR_NO_RESOURCES; retval = vmci_route(&src, &dst, false, &route); if (retval < VMCI_SUCCESS) route = vmci_guest_code_active() ? VMCI_ROUTE_AS_GUEST : VMCI_ROUTE_AS_HOST; if (flags & (VMCI_QPFLAG_NONBLOCK | VMCI_QPFLAG_PINNED)) { pr_devel("NONBLOCK OR PINNED set"); return VMCI_ERROR_INVALID_ARGS; } my_qpair = kzalloc(sizeof(*my_qpair), GFP_KERNEL); if (!my_qpair) return VMCI_ERROR_NO_MEM; my_qpair->produce_q_size = produce_qsize; my_qpair->consume_q_size = consume_qsize; my_qpair->peer = peer; my_qpair->flags = flags; my_qpair->priv_flags = priv_flags; wakeup_cb = NULL; client_data = NULL; if (VMCI_ROUTE_AS_HOST == route) { my_qpair->guest_endpoint = false; if (!(flags & VMCI_QPFLAG_LOCAL)) { my_qpair->blocked = 0; my_qpair->generation = 0; init_waitqueue_head(&my_qpair->event); wakeup_cb = qp_wakeup_cb; client_data = (void *)my_qpair; } } else { my_qpair->guest_endpoint = true; } retval = vmci_qp_alloc(handle, &my_qpair->produce_q, my_qpair->produce_q_size, &my_qpair->consume_q, my_qpair->consume_q_size, my_qpair->peer, my_qpair->flags, my_qpair->priv_flags, my_qpair->guest_endpoint, wakeup_cb, client_data); if (retval < VMCI_SUCCESS) { kfree(my_qpair); return retval; } *qpair = my_qpair; my_qpair->handle = *handle; return retval; } EXPORT_SYMBOL_GPL(vmci_qpair_alloc); /* * vmci_qpair_detach() - Detatches the client from a queue pair. * @qpair: Reference of a pointer to the qpair struct. * * This is the client interface for detaching from a VMCIQPair. * Note that this routine will free the memory allocated for the * vmci_qp structure too. */ int vmci_qpair_detach(struct vmci_qp **qpair) { int result; struct vmci_qp *old_qpair; if (!qpair || !(*qpair)) return VMCI_ERROR_INVALID_ARGS; old_qpair = *qpair; result = qp_detatch(old_qpair->handle, old_qpair->guest_endpoint); /* * The guest can fail to detach for a number of reasons, and * if it does so, it will cleanup the entry (if there is one). * The host can fail too, but it won't cleanup the entry * immediately, it will do that later when the context is * freed. Either way, we need to release the qpair struct * here; there isn't much the caller can do, and we don't want * to leak. */ memset(old_qpair, 0, sizeof(*old_qpair)); old_qpair->handle = VMCI_INVALID_HANDLE; old_qpair->peer = VMCI_INVALID_ID; kfree(old_qpair); *qpair = NULL; return result; } EXPORT_SYMBOL_GPL(vmci_qpair_detach); /* * vmci_qpair_get_produce_indexes() - Retrieves the indexes of the producer. * @qpair: Pointer to the queue pair struct. * @producer_tail: Reference used for storing producer tail index. * @consumer_head: Reference used for storing the consumer head index. * * This is the client interface for getting the current indexes of the * QPair from the point of the view of the caller as the producer. */ int vmci_qpair_get_produce_indexes(const struct vmci_qp *qpair, u64 *producer_tail, u64 *consumer_head) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; int result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) vmci_q_header_get_pointers(produce_q_header, consume_q_header, producer_tail, consumer_head); qp_unlock(qpair); if (result == VMCI_SUCCESS && ((producer_tail && *producer_tail >= qpair->produce_q_size) || (consumer_head && *consumer_head >= qpair->produce_q_size))) return VMCI_ERROR_INVALID_SIZE; return result; } EXPORT_SYMBOL_GPL(vmci_qpair_get_produce_indexes); /* * vmci_qpair_get_consume_indexes() - Retrieves the indexes of the consumer. * @qpair: Pointer to the queue pair struct. * @consumer_tail: Reference used for storing consumer tail index. * @producer_head: Reference used for storing the producer head index. * * This is the client interface for getting the current indexes of the * QPair from the point of the view of the caller as the consumer. */ int vmci_qpair_get_consume_indexes(const struct vmci_qp *qpair, u64 *consumer_tail, u64 *producer_head) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; int result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) vmci_q_header_get_pointers(consume_q_header, produce_q_header, consumer_tail, producer_head); qp_unlock(qpair); if (result == VMCI_SUCCESS && ((consumer_tail && *consumer_tail >= qpair->consume_q_size) || (producer_head && *producer_head >= qpair->consume_q_size))) return VMCI_ERROR_INVALID_SIZE; return result; } EXPORT_SYMBOL_GPL(vmci_qpair_get_consume_indexes); /* * vmci_qpair_produce_free_space() - Retrieves free space in producer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of free * space in the QPair from the point of the view of the caller as * the producer which is the common case. Returns < 0 if err, else * available bytes into which data can be enqueued if > 0. */ s64 vmci_qpair_produce_free_space(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_free_space(produce_q_header, consume_q_header, qpair->produce_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_produce_free_space); /* * vmci_qpair_consume_free_space() - Retrieves free space in consumer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of free * space in the QPair from the point of the view of the caller as * the consumer which is not the common case. Returns < 0 if err, else * available bytes into which data can be enqueued if > 0. */ s64 vmci_qpair_consume_free_space(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_free_space(consume_q_header, produce_q_header, qpair->consume_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_consume_free_space); /* * vmci_qpair_produce_buf_ready() - Gets bytes ready to read from * producer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of * enqueued data in the QPair from the point of the view of the * caller as the producer which is not the common case. Returns < 0 if err, * else available bytes that may be read. */ s64 vmci_qpair_produce_buf_ready(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_buf_ready(produce_q_header, consume_q_header, qpair->produce_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_produce_buf_ready); /* * vmci_qpair_consume_buf_ready() - Gets bytes ready to read from * consumer queue. * @qpair: Pointer to the queue pair struct. * * This is the client interface for getting the amount of * enqueued data in the QPair from the point of the view of the * caller as the consumer which is the normal case. Returns < 0 if err, * else available bytes that may be read. */ s64 vmci_qpair_consume_buf_ready(const struct vmci_qp *qpair) { struct vmci_queue_header *produce_q_header; struct vmci_queue_header *consume_q_header; s64 result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); result = qp_get_queue_headers(qpair, &produce_q_header, &consume_q_header); if (result == VMCI_SUCCESS) result = vmci_q_header_buf_ready(consume_q_header, produce_q_header, qpair->consume_q_size); else result = 0; qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_consume_buf_ready); /* * vmci_qpair_enqueue() - Throw data on the queue. * @qpair: Pointer to the queue pair struct. * @buf: Pointer to buffer containing data * @buf_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for enqueueing data into the queue. * Returns number of bytes enqueued or < 0 on error. */ ssize_t vmci_qpair_enqueue(struct vmci_qp *qpair, const void *buf, size_t buf_size, int buf_type) { ssize_t result; struct iov_iter from; struct kvec v = {.iov_base = (void *)buf, .iov_len = buf_size}; if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; iov_iter_kvec(&from, ITER_SOURCE, &v, 1, buf_size); qp_lock(qpair); do { result = qp_enqueue_locked(qpair->produce_q, qpair->consume_q, qpair->produce_q_size, &from); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_enqueue); /* * vmci_qpair_dequeue() - Get data from the queue. * @qpair: Pointer to the queue pair struct. * @buf: Pointer to buffer for the data * @buf_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for dequeueing data from the queue. * Returns number of bytes dequeued or < 0 on error. */ ssize_t vmci_qpair_dequeue(struct vmci_qp *qpair, void *buf, size_t buf_size, int buf_type) { ssize_t result; struct iov_iter to; struct kvec v = {.iov_base = buf, .iov_len = buf_size}; if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; iov_iter_kvec(&to, ITER_DEST, &v, 1, buf_size); qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &to, true); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_dequeue); /* * vmci_qpair_peek() - Peek at the data in the queue. * @qpair: Pointer to the queue pair struct. * @buf: Pointer to buffer for the data * @buf_size: Length of buffer. * @buf_type: Buffer type (Unused on Linux). * * This is the client interface for peeking into a queue. (I.e., * copy data from the queue without updating the head pointer.) * Returns number of bytes dequeued or < 0 on error. */ ssize_t vmci_qpair_peek(struct vmci_qp *qpair, void *buf, size_t buf_size, int buf_type) { struct iov_iter to; struct kvec v = {.iov_base = buf, .iov_len = buf_size}; ssize_t result; if (!qpair || !buf) return VMCI_ERROR_INVALID_ARGS; iov_iter_kvec(&to, ITER_DEST, &v, 1, buf_size); qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &to, false); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_peek); /* * vmci_qpair_enquev() - Throw data on the queue using iov. * @qpair: Pointer to the queue pair struct. * @iov: Pointer to buffer containing data * @iov_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for enqueueing data into the queue. * This function uses IO vectors to handle the work. Returns number * of bytes enqueued or < 0 on error. */ ssize_t vmci_qpair_enquev(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); do { result = qp_enqueue_locked(qpair->produce_q, qpair->consume_q, qpair->produce_q_size, &msg->msg_iter); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_enquev); /* * vmci_qpair_dequev() - Get data from the queue using iov. * @qpair: Pointer to the queue pair struct. * @iov: Pointer to buffer for the data * @iov_size: Length of buffer. * @buf_type: Buffer type (Unused). * * This is the client interface for dequeueing data from the queue. * This function uses IO vectors to handle the work. Returns number * of bytes dequeued or < 0 on error. */ ssize_t vmci_qpair_dequev(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &msg->msg_iter, true); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_dequev); /* * vmci_qpair_peekv() - Peek at the data in the queue using iov. * @qpair: Pointer to the queue pair struct. * @iov: Pointer to buffer for the data * @iov_size: Length of buffer. * @buf_type: Buffer type (Unused on Linux). * * This is the client interface for peeking into a queue. (I.e., * copy data from the queue without updating the head pointer.) * This function uses IO vectors to handle the work. Returns number * of bytes peeked or < 0 on error. */ ssize_t vmci_qpair_peekv(struct vmci_qp *qpair, struct msghdr *msg, size_t iov_size, int buf_type) { ssize_t result; if (!qpair) return VMCI_ERROR_INVALID_ARGS; qp_lock(qpair); do { result = qp_dequeue_locked(qpair->produce_q, qpair->consume_q, qpair->consume_q_size, &msg->msg_iter, false); if (result == VMCI_ERROR_QUEUEPAIR_NOT_READY && !qp_wait_for_ready_queue(qpair)) result = VMCI_ERROR_WOULD_BLOCK; } while (result == VMCI_ERROR_QUEUEPAIR_NOT_READY); qp_unlock(qpair); return result; } EXPORT_SYMBOL_GPL(vmci_qpair_peekv);
12 12 12 10 24 26 26 18 30 22 18 18 17 21 21 19 19 19 19 19 9 9 9 9 9 10 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "main.h" #include <linux/byteorder/generic.h> #include <linux/container_of.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_ether.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <uapi/linux/batadv_packet.h> #include "originator.h" #include "send.h" #include "tvlv.h" /** * batadv_tvlv_handler_release() - release tvlv handler from lists and queue for * free after rcu grace period * @ref: kref pointer of the tvlv */ static void batadv_tvlv_handler_release(struct kref *ref) { struct batadv_tvlv_handler *tvlv_handler; tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount); kfree_rcu(tvlv_handler, rcu); } /** * batadv_tvlv_handler_put() - decrement the tvlv container refcounter and * possibly release it * @tvlv_handler: the tvlv handler to free */ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) { if (!tvlv_handler) return; kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); } /** * batadv_tvlv_handler_get() - retrieve tvlv handler from the tvlv handler list * based on the provided type and version (both need to match) * @bat_priv: the bat priv with all the soft interface information * @type: tvlv handler type to look for * @version: tvlv handler version to look for * * Return: tvlv handler if found or NULL otherwise. */ static struct batadv_tvlv_handler * batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tvlv_handler_tmp, &bat_priv->tvlv.handler_list, list) { if (tvlv_handler_tmp->type != type) continue; if (tvlv_handler_tmp->version != version) continue; if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount)) continue; tvlv_handler = tvlv_handler_tmp; break; } rcu_read_unlock(); return tvlv_handler; } /** * batadv_tvlv_container_release() - release tvlv from lists and free * @ref: kref pointer of the tvlv */ static void batadv_tvlv_container_release(struct kref *ref) { struct batadv_tvlv_container *tvlv; tvlv = container_of(ref, struct batadv_tvlv_container, refcount); kfree(tvlv); } /** * batadv_tvlv_container_put() - decrement the tvlv container refcounter and * possibly release it * @tvlv: the tvlv container to free */ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) { if (!tvlv) return; kref_put(&tvlv->refcount, batadv_tvlv_container_release); } /** * batadv_tvlv_container_get() - retrieve tvlv container from the tvlv container * list based on the provided type and version (both need to match) * @bat_priv: the bat priv with all the soft interface information * @type: tvlv container type to look for * @version: tvlv container version to look for * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * * Return: tvlv container if found or NULL otherwise. */ static struct batadv_tvlv_container * batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; lockdep_assert_held(&bat_priv->tvlv.container_list_lock); hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) { if (tvlv_tmp->tvlv_hdr.type != type) continue; if (tvlv_tmp->tvlv_hdr.version != version) continue; kref_get(&tvlv_tmp->refcount); tvlv = tvlv_tmp; break; } return tvlv; } /** * batadv_tvlv_container_list_size() - calculate the size of the tvlv container * list entries * @bat_priv: the bat priv with all the soft interface information * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). * * Return: size of all currently registered tvlv containers in bytes. */ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; u16 tvlv_len = 0; lockdep_assert_held(&bat_priv->tvlv.container_list_lock); hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_len += sizeof(struct batadv_tvlv_hdr); tvlv_len += ntohs(tvlv->tvlv_hdr.len); } return tvlv_len; } /** * batadv_tvlv_container_remove() - remove tvlv container from the tvlv * container list * @bat_priv: the bat priv with all the soft interface information * @tvlv: the to be removed tvlv container * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). */ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, struct batadv_tvlv_container *tvlv) { lockdep_assert_held(&bat_priv->tvlv.container_list_lock); if (!tvlv) return; hlist_del(&tvlv->list); /* first call to decrement the counter, second call to free */ batadv_tvlv_container_put(tvlv); batadv_tvlv_container_put(tvlv); } /** * batadv_tvlv_container_unregister() - unregister tvlv container based on the * provided type and version (both need to match) * @bat_priv: the bat priv with all the soft interface information * @type: tvlv container type to unregister * @version: tvlv container type to unregister */ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv; spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv = batadv_tvlv_container_get(bat_priv, type, version); batadv_tvlv_container_remove(bat_priv, tvlv); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); } /** * batadv_tvlv_container_register() - register tvlv type, version and content * to be propagated with each (primary interface) OGM * @bat_priv: the bat priv with all the soft interface information * @type: tvlv container type * @version: tvlv container version * @tvlv_value: tvlv container content * @tvlv_value_len: tvlv container content length * * If a container of the same type and version was already registered the new * content is going to replace the old one. */ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_container *tvlv_old, *tvlv_new; if (!tvlv_value) tvlv_value_len = 0; tvlv_new = kzalloc(sizeof(*tvlv_new) + tvlv_value_len, GFP_ATOMIC); if (!tvlv_new) return; tvlv_new->tvlv_hdr.version = version; tvlv_new->tvlv_hdr.type = type; tvlv_new->tvlv_hdr.len = htons(tvlv_value_len); memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len)); INIT_HLIST_NODE(&tvlv_new->list); kref_init(&tvlv_new->refcount); spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); batadv_tvlv_container_remove(bat_priv, tvlv_old); kref_get(&tvlv_new->refcount); hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); /* don't return reference to new tvlv_container */ batadv_tvlv_container_put(tvlv_new); } /** * batadv_tvlv_realloc_packet_buff() - reallocate packet buffer to accommodate * requested packet size * @packet_buff: packet buffer * @packet_buff_len: packet buffer size * @min_packet_len: requested packet minimum size * @additional_packet_len: requested additional packet size on top of minimum * size * * Return: true of the packet buffer could be changed to the requested size, * false otherwise. */ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, int *packet_buff_len, int min_packet_len, int additional_packet_len) { unsigned char *new_buff; new_buff = kmalloc(min_packet_len + additional_packet_len, GFP_ATOMIC); /* keep old buffer if kmalloc should fail */ if (!new_buff) return false; memcpy(new_buff, *packet_buff, min_packet_len); kfree(*packet_buff); *packet_buff = new_buff; *packet_buff_len = min_packet_len + additional_packet_len; return true; } /** * batadv_tvlv_container_ogm_append() - append tvlv container content to given * OGM packet buffer * @bat_priv: the bat priv with all the soft interface information * @packet_buff: ogm packet buffer * @packet_buff_len: ogm packet buffer size including ogm header and tvlv * content * @packet_min_len: ogm header size to be preserved for the OGM itself * * The ogm packet might be enlarged or shrunk depending on the current size * and the size of the to-be-appended tvlv containers. * * Return: size of all appended tvlv containers in bytes. */ u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, unsigned char **packet_buff, int *packet_buff_len, int packet_min_len) { struct batadv_tvlv_container *tvlv; struct batadv_tvlv_hdr *tvlv_hdr; u16 tvlv_value_len; void *tvlv_value; bool ret; spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_value_len = batadv_tvlv_container_list_size(bat_priv); ret = batadv_tvlv_realloc_packet_buff(packet_buff, packet_buff_len, packet_min_len, tvlv_value_len); if (!ret) goto end; if (!tvlv_value_len) goto end; tvlv_value = (*packet_buff) + packet_min_len; hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_hdr = tvlv_value; tvlv_hdr->type = tvlv->tvlv_hdr.type; tvlv_hdr->version = tvlv->tvlv_hdr.version; tvlv_hdr->len = tvlv->tvlv_hdr.len; tvlv_value = tvlv_hdr + 1; memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len)); tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); } end: spin_unlock_bh(&bat_priv->tvlv.container_list_lock); return tvlv_value_len; } /** * batadv_tvlv_call_handler() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the soft interface information * @tvlv_handler: tvlv callback function handling the tvlv content * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * * Return: success if the handler was not found or the return value of the * handler callback. */ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, u8 packet_type, struct batadv_orig_node *orig_node, struct sk_buff *skb, void *tvlv_value, u16 tvlv_value_len) { unsigned int tvlv_offset; u8 *src, *dst; if (!tvlv_handler) return NET_RX_SUCCESS; switch (packet_type) { case BATADV_IV_OGM: case BATADV_OGM2: if (!tvlv_handler->ogm_handler) return NET_RX_SUCCESS; if (!orig_node) return NET_RX_SUCCESS; tvlv_handler->ogm_handler(bat_priv, orig_node, BATADV_NO_FLAGS, tvlv_value, tvlv_value_len); tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED; break; case BATADV_UNICAST_TVLV: if (!skb) return NET_RX_SUCCESS; if (!tvlv_handler->unicast_handler) return NET_RX_SUCCESS; src = ((struct batadv_unicast_tvlv_packet *)skb->data)->src; dst = ((struct batadv_unicast_tvlv_packet *)skb->data)->dst; return tvlv_handler->unicast_handler(bat_priv, src, dst, tvlv_value, tvlv_value_len); case BATADV_MCAST: if (!skb) return NET_RX_SUCCESS; if (!tvlv_handler->mcast_handler) return NET_RX_SUCCESS; tvlv_offset = (unsigned char *)tvlv_value - skb->data; skb_set_network_header(skb, tvlv_offset); skb_set_transport_header(skb, tvlv_offset + tvlv_value_len); return tvlv_handler->mcast_handler(bat_priv, skb); } return NET_RX_SUCCESS; } /** * batadv_tvlv_containers_process() - parse the given tvlv buffer to call the * appropriate handlers * @bat_priv: the bat priv with all the soft interface information * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet * @skb: the skb the TVLV handler is called for * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length * * Return: success when processing an OGM or the return value of all called * handler callbacks. */ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, u8 packet_type, struct batadv_orig_node *orig_node, struct sk_buff *skb, void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_handler *tvlv_handler; struct batadv_tvlv_hdr *tvlv_hdr; u16 tvlv_value_cont_len; u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; int ret = NET_RX_SUCCESS; while (tvlv_value_len >= sizeof(*tvlv_hdr)) { tvlv_hdr = tvlv_value; tvlv_value_cont_len = ntohs(tvlv_hdr->len); tvlv_value = tvlv_hdr + 1; tvlv_value_len -= sizeof(*tvlv_hdr); if (tvlv_value_cont_len > tvlv_value_len) break; tvlv_handler = batadv_tvlv_handler_get(bat_priv, tvlv_hdr->type, tvlv_hdr->version); ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler, packet_type, orig_node, skb, tvlv_value, tvlv_value_cont_len); batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } if (packet_type != BATADV_IV_OGM && packet_type != BATADV_OGM2) return ret; rcu_read_lock(); hlist_for_each_entry_rcu(tvlv_handler, &bat_priv->tvlv.handler_list, list) { if (!tvlv_handler->ogm_handler) continue; if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) && !(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED)) tvlv_handler->ogm_handler(bat_priv, orig_node, cifnotfound, NULL, 0); tvlv_handler->flags &= ~BATADV_TVLV_HANDLER_OGM_CALLED; } rcu_read_unlock(); return NET_RX_SUCCESS; } /** * batadv_tvlv_ogm_receive() - process an incoming ogm and call the appropriate * handlers * @bat_priv: the bat priv with all the soft interface information * @batadv_ogm_packet: ogm packet containing the tvlv containers * @orig_node: orig node emitting the ogm packet */ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_orig_node *orig_node) { void *tvlv_value; u16 tvlv_value_len; if (!batadv_ogm_packet) return; tvlv_value_len = ntohs(batadv_ogm_packet->tvlv_len); if (!tvlv_value_len) return; tvlv_value = batadv_ogm_packet + 1; batadv_tvlv_containers_process(bat_priv, BATADV_IV_OGM, orig_node, NULL, tvlv_value, tvlv_value_len); } /** * batadv_tvlv_handler_register() - register tvlv handler based on the provided * type and version (both need to match) for ogm tvlv payload and/or unicast * payload * @bat_priv: the bat priv with all the soft interface information * @optr: ogm tvlv handler callback function. This function receives the orig * node, flags and the tvlv content as argument to process. * @uptr: unicast tvlv handler callback function. This function receives the * source & destination of the unicast packet as well as the tvlv content * to process. * @mptr: multicast packet tvlv handler callback function. This function * receives the full skb to process, with the skb network header pointing * to the current tvlv and the skb transport header pointing to the first * byte after the current tvlv. * @type: tvlv handler type to be registered * @version: tvlv handler version to be registered * @flags: flags to enable or disable TVLV API behavior */ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, void (*optr)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len), int (*uptr)(struct batadv_priv *bat_priv, u8 *src, u8 *dst, void *tvlv_value, u16 tvlv_value_len), int (*mptr)(struct batadv_priv *bat_priv, struct sk_buff *skb), u8 type, u8 version, u8 flags) { struct batadv_tvlv_handler *tvlv_handler; spin_lock_bh(&bat_priv->tvlv.handler_list_lock); tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (tvlv_handler) { spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_put(tvlv_handler); return; } tvlv_handler = kzalloc(sizeof(*tvlv_handler), GFP_ATOMIC); if (!tvlv_handler) { spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); return; } tvlv_handler->ogm_handler = optr; tvlv_handler->unicast_handler = uptr; tvlv_handler->mcast_handler = mptr; tvlv_handler->type = type; tvlv_handler->version = version; tvlv_handler->flags = flags; kref_init(&tvlv_handler->refcount); INIT_HLIST_NODE(&tvlv_handler->list); kref_get(&tvlv_handler->refcount); hlist_add_head_rcu(&tvlv_handler->list, &bat_priv->tvlv.handler_list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); /* don't return reference to new tvlv_handler */ batadv_tvlv_handler_put(tvlv_handler); } /** * batadv_tvlv_handler_unregister() - unregister tvlv handler based on the * provided type and version (both need to match) * @bat_priv: the bat priv with all the soft interface information * @type: tvlv handler type to be unregistered * @version: tvlv handler version to be unregistered */ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler; tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version); if (!tvlv_handler) return; batadv_tvlv_handler_put(tvlv_handler); spin_lock_bh(&bat_priv->tvlv.handler_list_lock); hlist_del_rcu(&tvlv_handler->list); spin_unlock_bh(&bat_priv->tvlv.handler_list_lock); batadv_tvlv_handler_put(tvlv_handler); } /** * batadv_tvlv_unicast_send() - send a unicast packet with tvlv payload to the * specified host * @bat_priv: the bat priv with all the soft interface information * @src: source mac address of the unicast packet * @dst: destination mac address of the unicast packet * @type: tvlv type * @version: tvlv version * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length */ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src, const u8 *dst, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len) { struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; struct batadv_tvlv_hdr *tvlv_hdr; struct batadv_orig_node *orig_node; struct sk_buff *skb; unsigned char *tvlv_buff; unsigned int tvlv_len; ssize_t hdr_len = sizeof(*unicast_tvlv_packet); orig_node = batadv_orig_hash_find(bat_priv, dst); if (!orig_node) return; tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len; skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + hdr_len + tvlv_len); if (!skb) goto out; skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, ETH_HLEN); tvlv_buff = skb_put(skb, sizeof(*unicast_tvlv_packet) + tvlv_len); unicast_tvlv_packet = (struct batadv_unicast_tvlv_packet *)tvlv_buff; unicast_tvlv_packet->packet_type = BATADV_UNICAST_TVLV; unicast_tvlv_packet->version = BATADV_COMPAT_VERSION; unicast_tvlv_packet->ttl = BATADV_TTL; unicast_tvlv_packet->reserved = 0; unicast_tvlv_packet->tvlv_len = htons(tvlv_len); unicast_tvlv_packet->align = 0; ether_addr_copy(unicast_tvlv_packet->src, src); ether_addr_copy(unicast_tvlv_packet->dst, dst); tvlv_buff = (unsigned char *)(unicast_tvlv_packet + 1); tvlv_hdr = (struct batadv_tvlv_hdr *)tvlv_buff; tvlv_hdr->version = version; tvlv_hdr->type = type; tvlv_hdr->len = htons(tvlv_value_len); tvlv_buff += sizeof(*tvlv_hdr); memcpy(tvlv_buff, tvlv_value, tvlv_value_len); batadv_send_skb_to_orig(skb, orig_node, NULL); out: batadv_orig_node_put(orig_node); }
5 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match Hop-by-Hop and Destination parameters. */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <asm/byteorder.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_opts.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 Hop-By-Hop and Destination Header match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); MODULE_ALIAS("ip6t_dst"); /* * (Type & 0xC0) >> 6 * 0 -> ignorable * 1 -> must drop the packet * 2 -> send ICMP PARM PROB regardless and drop packet * 3 -> Send ICMP if not a multicast address and drop packet * (Type & 0x20) >> 5 * 0 -> invariant * 1 -> can change the routing * (Type & 0x1F) Type * 0 -> Pad1 (only 1 byte!) * 1 -> PadN LENGTH info (total length = length + 2) * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k ) * 5 -> RTALERT 2 x x */ static struct xt_match hbh_mt6_reg[] __read_mostly; static bool hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6_opt_hdr _optsh; const struct ipv6_opt_hdr *oh; const struct ip6t_opts *optinfo = par->matchinfo; unsigned int temp; unsigned int ptr = 0; unsigned int hdrlen = 0; bool ret = false; u8 _opttype; u8 _optlen; const u_int8_t *tp = NULL; const u_int8_t *lp = NULL; unsigned int optlen; int err; err = ipv6_find_hdr(skb, &ptr, (par->match == &hbh_mt6_reg[0]) ? NEXTHDR_HOP : NEXTHDR_DEST, NULL, NULL); if (err < 0) { if (err != -ENOENT) par->hotdrop = true; return false; } oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); if (oh == NULL) { par->hotdrop = true; return false; } hdrlen = ipv6_optlen(oh); if (skb->len - ptr < hdrlen) { /* Packet smaller than it's length field */ return false; } pr_debug("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen); pr_debug("len %02X %04X %02X ", optinfo->hdrlen, hdrlen, (!(optinfo->flags & IP6T_OPTS_LEN) || ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); ret = (!(optinfo->flags & IP6T_OPTS_LEN) || ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); ptr += 2; hdrlen -= 2; if (!(optinfo->flags & IP6T_OPTS_OPTS)) { return ret; } else { pr_debug("Strict "); pr_debug("#%d ", optinfo->optsnr); for (temp = 0; temp < optinfo->optsnr; temp++) { /* type field exists ? */ if (hdrlen < 1) break; tp = skb_header_pointer(skb, ptr, sizeof(_opttype), &_opttype); if (tp == NULL) break; /* Type check */ if (*tp != (optinfo->opts[temp] & 0xFF00) >> 8) { pr_debug("Tbad %02X %02X\n", *tp, (optinfo->opts[temp] & 0xFF00) >> 8); return false; } else { pr_debug("Tok "); } /* Length check */ if (*tp) { u16 spec_len; /* length field exists ? */ if (hdrlen < 2) break; lp = skb_header_pointer(skb, ptr + 1, sizeof(_optlen), &_optlen); if (lp == NULL) break; spec_len = optinfo->opts[temp] & 0x00FF; if (spec_len != 0x00FF && spec_len != *lp) { pr_debug("Lbad %02X %04X\n", *lp, spec_len); return false; } pr_debug("Lok "); optlen = *lp + 2; } else { pr_debug("Pad1\n"); optlen = 1; } /* Step to the next */ pr_debug("len%04X\n", optlen); if ((ptr > skb->len - optlen || hdrlen < optlen) && temp < optinfo->optsnr - 1) { pr_debug("new pointer is too large!\n"); break; } ptr += optlen; hdrlen -= optlen; } if (temp == optinfo->optsnr) return ret; else return false; } return false; } static int hbh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_opts *optsinfo = par->matchinfo; if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { pr_debug("unknown flags %X\n", optsinfo->invflags); return -EINVAL; } if (optsinfo->flags & IP6T_OPTS_NSTRICT) { pr_debug("Not strict - not implemented"); return -EINVAL; } return 0; } static struct xt_match hbh_mt6_reg[] __read_mostly = { { /* Note, hbh_mt6 relies on the order of hbh_mt6_reg */ .name = "hbh", .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, }, { .name = "dst", .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, }, }; static int __init hbh_mt6_init(void) { return xt_register_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); } static void __exit hbh_mt6_exit(void) { xt_unregister_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); } module_init(hbh_mt6_init); module_exit(hbh_mt6_exit);
120 120 1 3 1 119 2 1 116 1 2 1 1 1 1 108 3 8 91 6 2 5 3 89 92 2 1 71 20 11 88 119 2 122 2 2 122 121 2 4 4 39 147 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * super.c */ /* * This file implements code to read the superblock, read and initialise * in-memory structures at mount time, and all the VFS glue code to register * the filesystem. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/vfs.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/init.h> #include <linux/module.h> #include <linux/magic.h> #include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "decompressor.h" #include "xattr.h" static struct file_system_type squashfs_fs_type; static const struct super_operations squashfs_super_ops; enum Opt_errors { Opt_errors_continue, Opt_errors_panic, }; enum squashfs_param { Opt_errors, Opt_threads, }; struct squashfs_mount_opts { enum Opt_errors errors; const struct squashfs_decompressor_thread_ops *thread_ops; int thread_num; }; static const struct constant_table squashfs_param_errors[] = { {"continue", Opt_errors_continue }, {"panic", Opt_errors_panic }, {} }; static const struct fs_parameter_spec squashfs_fs_parameters[] = { fsparam_enum("errors", Opt_errors, squashfs_param_errors), fsparam_string("threads", Opt_threads), {} }; static int squashfs_parse_param_threads_str(const char *str, struct squashfs_mount_opts *opts) { #ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT if (strcmp(str, "single") == 0) { opts->thread_ops = &squashfs_decompressor_single; return 0; } if (strcmp(str, "multi") == 0) { opts->thread_ops = &squashfs_decompressor_multi; return 0; } if (strcmp(str, "percpu") == 0) { opts->thread_ops = &squashfs_decompressor_percpu; return 0; } #endif return -EINVAL; } static int squashfs_parse_param_threads_num(const char *str, struct squashfs_mount_opts *opts) { #ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS int ret; unsigned long num; ret = kstrtoul(str, 0, &num); if (ret != 0) return -EINVAL; if (num > 1) { opts->thread_ops = &squashfs_decompressor_multi; if (num > opts->thread_ops->max_decompressors()) return -EINVAL; opts->thread_num = (int)num; return 0; } #ifdef CONFIG_SQUASHFS_DECOMP_SINGLE if (num == 1) { opts->thread_ops = &squashfs_decompressor_single; opts->thread_num = 1; return 0; } #endif #endif /* !CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS */ return -EINVAL; } static int squashfs_parse_param_threads(const char *str, struct squashfs_mount_opts *opts) { int ret = squashfs_parse_param_threads_str(str, opts); if (ret == 0) return ret; return squashfs_parse_param_threads_num(str, opts); } static int squashfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct squashfs_mount_opts *opts = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, squashfs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_errors: opts->errors = result.uint_32; break; case Opt_threads: if (squashfs_parse_param_threads(param->string, opts) != 0) return -EINVAL; break; default: return -EINVAL; } return 0; } static const struct squashfs_decompressor *supported_squashfs_filesystem( struct fs_context *fc, short major, short minor, short id) { const struct squashfs_decompressor *decompressor; if (major < SQUASHFS_MAJOR) { errorf(fc, "Major/Minor mismatch, older Squashfs %d.%d " "filesystems are unsupported", major, minor); return NULL; } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { errorf(fc, "Major/Minor mismatch, trying to mount newer " "%d.%d filesystem", major, minor); errorf(fc, "Please update your kernel"); return NULL; } decompressor = squashfs_lookup_decompressor(id); if (!decompressor->supported) { errorf(fc, "Filesystem uses \"%s\" compression. This is not supported", decompressor->name); return NULL; } return decompressor; } static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct squashfs_mount_opts *opts = fc->fs_private; struct squashfs_sb_info *msblk; struct squashfs_super_block *sblk = NULL; struct inode *root; long long root_inode; unsigned short flags; unsigned int fragments; u64 lookup_table_start, xattr_id_table_start, next_table; int err; TRACE("Entered squashfs_fill_superblock\n"); sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); return -ENOMEM; } msblk = sb->s_fs_info; msblk->thread_ops = opts->thread_ops; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); /* * msblk->bytes_used is checked in squashfs_read_table to ensure reads * are not beyond filesystem end. But as we're using * squashfs_read_table here to read the superblock (including the value * of bytes_used) we need to set it to an initial sensible dummy value */ msblk->bytes_used = sizeof(*sblk); sblk = squashfs_read_table(sb, SQUASHFS_START, sizeof(*sblk)); if (IS_ERR(sblk)) { errorf(fc, "unable to read squashfs_super_block"); err = PTR_ERR(sblk); sblk = NULL; goto failed_mount; } err = -EINVAL; /* Check it is a SQUASHFS superblock */ sb->s_magic = le32_to_cpu(sblk->s_magic); if (sb->s_magic != SQUASHFS_MAGIC) { if (!(fc->sb_flags & SB_SILENT)) errorf(fc, "Can't find a SQUASHFS superblock on %pg", sb->s_bdev); goto failed_mount; } if (opts->thread_num == 0) { msblk->max_thread_num = msblk->thread_ops->max_decompressors(); } else { msblk->max_thread_num = opts->thread_num; } /* Check the MAJOR & MINOR versions and lookup compression type */ msblk->decompressor = supported_squashfs_filesystem( fc, le16_to_cpu(sblk->s_major), le16_to_cpu(sblk->s_minor), le16_to_cpu(sblk->compression)); if (msblk->decompressor == NULL) goto failed_mount; /* Check the filesystem does not extend beyond the end of the block device */ msblk->bytes_used = le64_to_cpu(sblk->bytes_used); if (msblk->bytes_used < 0 || msblk->bytes_used > bdev_nr_bytes(sb->s_bdev)) goto failed_mount; /* Check block size for sanity */ msblk->block_size = le32_to_cpu(sblk->block_size); if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) goto insanity; /* * Check the system page size is not larger than the filesystem * block size (by default 128K). This is currently not supported. */ if (PAGE_SIZE > msblk->block_size) { errorf(fc, "Page size > filesystem block size (%d). This is " "currently not supported!", msblk->block_size); goto failed_mount; } /* Check block log for sanity */ msblk->block_log = le16_to_cpu(sblk->block_log); if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) goto failed_mount; /* Check that block_size and block_log match */ if (msblk->block_size != (1 << msblk->block_log)) goto insanity; /* Check the root inode for sanity */ root_inode = le64_to_cpu(sblk->root_inode); if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) goto insanity; msblk->inode_table = le64_to_cpu(sblk->inode_table_start); msblk->directory_table = le64_to_cpu(sblk->directory_table_start); msblk->inodes = le32_to_cpu(sblk->inodes); msblk->fragments = le32_to_cpu(sblk->fragments); msblk->ids = le16_to_cpu(sblk->no_ids); flags = le16_to_cpu(sblk->flags); TRACE("Found valid superblock on %pg\n", sb->s_bdev); TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) ? "un" : ""); TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) ? "un" : ""); TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); TRACE("Block size %d\n", msblk->block_size); TRACE("Number of inodes %d\n", msblk->inodes); TRACE("Number of fragments %d\n", msblk->fragments); TRACE("Number of ids %d\n", msblk->ids); TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); TRACE("sblk->fragment_table_start %llx\n", (u64) le64_to_cpu(sblk->fragment_table_start)); TRACE("sblk->id_table_start %llx\n", (u64) le64_to_cpu(sblk->id_table_start)); sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_min = 0; sb->s_time_max = U32_MAX; sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; err = -ENOMEM; msblk->block_cache = squashfs_cache_init("metadata", SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); if (msblk->block_cache == NULL) goto failed_mount; /* Allocate read_page block */ msblk->read_page = squashfs_cache_init("data", msblk->max_thread_num, msblk->block_size); if (msblk->read_page == NULL) { errorf(fc, "Failed to allocate read_page block"); goto failed_mount; } if (msblk->devblksize == PAGE_SIZE) { struct inode *cache = new_inode(sb); if (cache == NULL) goto failed_mount; set_nlink(cache, 1); cache->i_size = OFFSET_MAX; mapping_set_gfp_mask(cache->i_mapping, GFP_NOFS); msblk->cache_mapping = cache->i_mapping; } msblk->stream = squashfs_decompressor_setup(sb, flags); if (IS_ERR(msblk->stream)) { err = PTR_ERR(msblk->stream); msblk->stream = NULL; goto insanity; } /* Handle xattrs */ sb->s_xattr = squashfs_xattr_handlers; xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start); if (xattr_id_table_start == SQUASHFS_INVALID_BLK) { next_table = msblk->bytes_used; goto allocate_id_index_table; } /* Allocate and read xattr id lookup table */ msblk->xattr_id_table = squashfs_read_xattr_id_table(sb, xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids); if (IS_ERR(msblk->xattr_id_table)) { errorf(fc, "unable to read xattr id index table"); err = PTR_ERR(msblk->xattr_id_table); msblk->xattr_id_table = NULL; if (err != -ENOTSUPP) goto failed_mount; } next_table = msblk->xattr_table; allocate_id_index_table: /* Allocate and read id index table */ msblk->id_table = squashfs_read_id_index_table(sb, le64_to_cpu(sblk->id_table_start), next_table, msblk->ids); if (IS_ERR(msblk->id_table)) { errorf(fc, "unable to read id index table"); err = PTR_ERR(msblk->id_table); msblk->id_table = NULL; goto failed_mount; } next_table = le64_to_cpu(msblk->id_table[0]); /* Handle inode lookup table */ lookup_table_start = le64_to_cpu(sblk->lookup_table_start); if (lookup_table_start == SQUASHFS_INVALID_BLK) goto handle_fragments; /* Allocate and read inode lookup table */ msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, lookup_table_start, next_table, msblk->inodes); if (IS_ERR(msblk->inode_lookup_table)) { errorf(fc, "unable to read inode lookup table"); err = PTR_ERR(msblk->inode_lookup_table); msblk->inode_lookup_table = NULL; goto failed_mount; } next_table = le64_to_cpu(msblk->inode_lookup_table[0]); sb->s_export_op = &squashfs_export_ops; handle_fragments: fragments = msblk->fragments; if (fragments == 0) goto check_directory_table; msblk->fragment_cache = squashfs_cache_init("fragment", SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); if (msblk->fragment_cache == NULL) { err = -ENOMEM; goto failed_mount; } /* Allocate and read fragment index table */ msblk->fragment_index = squashfs_read_fragment_index_table(sb, le64_to_cpu(sblk->fragment_table_start), next_table, fragments); if (IS_ERR(msblk->fragment_index)) { errorf(fc, "unable to read fragment index table"); err = PTR_ERR(msblk->fragment_index); msblk->fragment_index = NULL; goto failed_mount; } next_table = le64_to_cpu(msblk->fragment_index[0]); check_directory_table: /* Sanity check directory_table */ if (msblk->directory_table > next_table) { err = -EINVAL; goto insanity; } /* Sanity check inode_table */ if (msblk->inode_table >= msblk->directory_table) { err = -EINVAL; goto insanity; } /* allocate root */ root = new_inode(sb); if (!root) { err = -ENOMEM; goto failed_mount; } err = squashfs_read_inode(root, root_inode); if (err) { make_bad_inode(root); iput(root); goto failed_mount; } insert_inode_hash(root); sb->s_root = d_make_root(root); if (sb->s_root == NULL) { ERROR("Root inode create failed\n"); err = -ENOMEM; goto failed_mount; } TRACE("Leaving squashfs_fill_super\n"); kfree(sblk); return 0; insanity: errorf(fc, "squashfs image failed sanity check"); failed_mount: squashfs_cache_delete(msblk->block_cache); squashfs_cache_delete(msblk->fragment_cache); squashfs_cache_delete(msblk->read_page); if (msblk->cache_mapping) iput(msblk->cache_mapping->host); msblk->thread_ops->destroy(msblk); kfree(msblk->inode_lookup_table); kfree(msblk->fragment_index); kfree(msblk->id_table); kfree(msblk->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; kfree(sblk); return err; } static int squashfs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, squashfs_fill_super); } static int squashfs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; struct squashfs_mount_opts *opts = fc->fs_private; sync_filesystem(fc->root->d_sb); fc->sb_flags |= SB_RDONLY; msblk->panic_on_errors = (opts->errors == Opt_errors_panic); return 0; } static void squashfs_free_fs_context(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations squashfs_context_ops = { .get_tree = squashfs_get_tree, .free = squashfs_free_fs_context, .parse_param = squashfs_parse_param, .reconfigure = squashfs_reconfigure, }; static int squashfs_show_options(struct seq_file *s, struct dentry *root) { struct super_block *sb = root->d_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; if (msblk->panic_on_errors) seq_puts(s, ",errors=panic"); else seq_puts(s, ",errors=continue"); #ifdef CONFIG_SQUASHFS_CHOICE_DECOMP_BY_MOUNT if (msblk->thread_ops == &squashfs_decompressor_single) { seq_puts(s, ",threads=single"); return 0; } if (msblk->thread_ops == &squashfs_decompressor_percpu) { seq_puts(s, ",threads=percpu"); return 0; } #endif #ifdef CONFIG_SQUASHFS_MOUNT_DECOMP_THREADS seq_printf(s, ",threads=%d", msblk->max_thread_num); #endif return 0; } static int squashfs_init_fs_context(struct fs_context *fc) { struct squashfs_mount_opts *opts; opts = kzalloc(sizeof(*opts), GFP_KERNEL); if (!opts) return -ENOMEM; #ifdef CONFIG_SQUASHFS_DECOMP_SINGLE opts->thread_ops = &squashfs_decompressor_single; #elif defined(CONFIG_SQUASHFS_DECOMP_MULTI) opts->thread_ops = &squashfs_decompressor_multi; #elif defined(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) opts->thread_ops = &squashfs_decompressor_percpu; #else #error "fail: unknown squashfs decompression thread mode?" #endif opts->thread_num = 0; fc->fs_private = opts; fc->ops = &squashfs_context_ops; return 0; } static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; u64 id = huge_encode_dev(dentry->d_sb->s_bdev->bd_dev); TRACE("Entered squashfs_statfs\n"); buf->f_type = SQUASHFS_MAGIC; buf->f_bsize = msblk->block_size; buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1; buf->f_bfree = buf->f_bavail = 0; buf->f_files = msblk->inodes; buf->f_ffree = 0; buf->f_namelen = SQUASHFS_NAME_LEN; buf->f_fsid = u64_to_fsid(id); return 0; } static void squashfs_put_super(struct super_block *sb) { if (sb->s_fs_info) { struct squashfs_sb_info *sbi = sb->s_fs_info; squashfs_cache_delete(sbi->block_cache); squashfs_cache_delete(sbi->fragment_cache); squashfs_cache_delete(sbi->read_page); if (sbi->cache_mapping) iput(sbi->cache_mapping->host); sbi->thread_ops->destroy(sbi); kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); kfree(sbi->inode_lookup_table); kfree(sbi->xattr_id_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } } static struct kmem_cache *squashfs_inode_cachep; static void init_once(void *foo) { struct squashfs_inode_info *ei = foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", sizeof(struct squashfs_inode_info), 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, init_once); return squashfs_inode_cachep ? 0 : -ENOMEM; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(squashfs_inode_cachep); } static int __init init_squashfs_fs(void) { int err = init_inodecache(); if (err) return err; err = register_filesystem(&squashfs_fs_type); if (err) { destroy_inodecache(); return err; } pr_info("version 4.0 (2009/01/31) Phillip Lougher\n"); return 0; } static void __exit exit_squashfs_fs(void) { unregister_filesystem(&squashfs_fs_type); destroy_inodecache(); } static struct inode *squashfs_alloc_inode(struct super_block *sb) { struct squashfs_inode_info *ei = alloc_inode_sb(sb, squashfs_inode_cachep, GFP_KERNEL); return ei ? &ei->vfs_inode : NULL; } static void squashfs_free_inode(struct inode *inode) { kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); } static struct file_system_type squashfs_fs_type = { .owner = THIS_MODULE, .name = "squashfs", .init_fs_context = squashfs_init_fs_context, .parameters = squashfs_fs_parameters, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("squashfs"); static const struct super_operations squashfs_super_ops = { .alloc_inode = squashfs_alloc_inode, .free_inode = squashfs_free_inode, .statfs = squashfs_statfs, .put_super = squashfs_put_super, .show_options = squashfs_show_options, }; module_init(init_squashfs_fs); module_exit(exit_squashfs_fs); MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); MODULE_AUTHOR("Phillip Lougher <phillip@squashfs.org.uk>"); MODULE_LICENSE("GPL");
1 1 1 1 3 1 1 5 1 4 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 // SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat Kova[+] driver for Linux * * Copyright (c) 2011 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ /* * Roccat Kova[+] is a bigger version of the Pyra with two more side buttons. */ #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/hid-roccat.h> #include "hid-ids.h" #include "hid-roccat-common.h" #include "hid-roccat-kovaplus.h" static uint profile_numbers[5] = {0, 1, 2, 3, 4}; static uint kovaplus_convert_event_cpi(uint value) { return (value == 7 ? 4 : (value == 4 ? 3 : value)); } static void kovaplus_profile_activated(struct kovaplus_device *kovaplus, uint new_profile_index) { if (new_profile_index >= ARRAY_SIZE(kovaplus->profile_settings)) return; kovaplus->actual_profile = new_profile_index; kovaplus->actual_cpi = kovaplus->profile_settings[new_profile_index].cpi_startup_level; kovaplus->actual_x_sensitivity = kovaplus->profile_settings[new_profile_index].sensitivity_x; kovaplus->actual_y_sensitivity = kovaplus->profile_settings[new_profile_index].sensitivity_y; } static int kovaplus_send_control(struct usb_device *usb_dev, uint value, enum kovaplus_control_requests request) { int retval; struct roccat_common2_control control; if ((request == KOVAPLUS_CONTROL_REQUEST_PROFILE_SETTINGS || request == KOVAPLUS_CONTROL_REQUEST_PROFILE_BUTTONS) && value > 4) return -EINVAL; control.command = ROCCAT_COMMON_COMMAND_CONTROL; control.value = value; control.request = request; retval = roccat_common2_send(usb_dev, ROCCAT_COMMON_COMMAND_CONTROL, &control, sizeof(struct roccat_common2_control)); return retval; } static int kovaplus_select_profile(struct usb_device *usb_dev, uint number, enum kovaplus_control_requests request) { return kovaplus_send_control(usb_dev, number, request); } static int kovaplus_get_profile_settings(struct usb_device *usb_dev, struct kovaplus_profile_settings *buf, uint number) { int retval; retval = kovaplus_select_profile(usb_dev, number, KOVAPLUS_CONTROL_REQUEST_PROFILE_SETTINGS); if (retval) return retval; return roccat_common2_receive(usb_dev, KOVAPLUS_COMMAND_PROFILE_SETTINGS, buf, KOVAPLUS_SIZE_PROFILE_SETTINGS); } static int kovaplus_get_profile_buttons(struct usb_device *usb_dev, struct kovaplus_profile_buttons *buf, int number) { int retval; retval = kovaplus_select_profile(usb_dev, number, KOVAPLUS_CONTROL_REQUEST_PROFILE_BUTTONS); if (retval) return retval; return roccat_common2_receive(usb_dev, KOVAPLUS_COMMAND_PROFILE_BUTTONS, buf, KOVAPLUS_SIZE_PROFILE_BUTTONS); } /* retval is 0-4 on success, < 0 on error */ static int kovaplus_get_actual_profile(struct usb_device *usb_dev) { struct kovaplus_actual_profile buf; int retval; retval = roccat_common2_receive(usb_dev, KOVAPLUS_COMMAND_ACTUAL_PROFILE, &buf, sizeof(struct kovaplus_actual_profile)); return retval ? retval : buf.actual_profile; } static int kovaplus_set_actual_profile(struct usb_device *usb_dev, int new_profile) { struct kovaplus_actual_profile buf; buf.command = KOVAPLUS_COMMAND_ACTUAL_PROFILE; buf.size = sizeof(struct kovaplus_actual_profile); buf.actual_profile = new_profile; return roccat_common2_send_with_status(usb_dev, KOVAPLUS_COMMAND_ACTUAL_PROFILE, &buf, sizeof(struct kovaplus_actual_profile)); } static ssize_t kovaplus_sysfs_read(struct file *fp, struct kobject *kobj, char *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off >= real_size) return 0; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&kovaplus->kovaplus_lock); retval = roccat_common2_receive(usb_dev, command, buf, real_size); mutex_unlock(&kovaplus->kovaplus_lock); if (retval) return retval; return real_size; } static ssize_t kovaplus_sysfs_write(struct file *fp, struct kobject *kobj, void const *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&kovaplus->kovaplus_lock); retval = roccat_common2_send_with_status(usb_dev, command, buf, real_size); mutex_unlock(&kovaplus->kovaplus_lock); if (retval) return retval; return real_size; } #define KOVAPLUS_SYSFS_W(thingy, THINGY) \ static ssize_t kovaplus_sysfs_write_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, char *buf, \ loff_t off, size_t count) \ { \ return kovaplus_sysfs_write(fp, kobj, buf, off, count, \ KOVAPLUS_SIZE_ ## THINGY, KOVAPLUS_COMMAND_ ## THINGY); \ } #define KOVAPLUS_SYSFS_R(thingy, THINGY) \ static ssize_t kovaplus_sysfs_read_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, char *buf, \ loff_t off, size_t count) \ { \ return kovaplus_sysfs_read(fp, kobj, buf, off, count, \ KOVAPLUS_SIZE_ ## THINGY, KOVAPLUS_COMMAND_ ## THINGY); \ } #define KOVAPLUS_SYSFS_RW(thingy, THINGY) \ KOVAPLUS_SYSFS_W(thingy, THINGY) \ KOVAPLUS_SYSFS_R(thingy, THINGY) #define KOVAPLUS_BIN_ATTRIBUTE_RW(thingy, THINGY) \ KOVAPLUS_SYSFS_RW(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0660 }, \ .size = KOVAPLUS_SIZE_ ## THINGY, \ .read = kovaplus_sysfs_read_ ## thingy, \ .write = kovaplus_sysfs_write_ ## thingy \ } #define KOVAPLUS_BIN_ATTRIBUTE_W(thingy, THINGY) \ KOVAPLUS_SYSFS_W(thingy, THINGY); \ static struct bin_attribute bin_attr_##thingy = { \ .attr = { .name = #thingy, .mode = 0220 }, \ .size = KOVAPLUS_SIZE_ ## THINGY, \ .write = kovaplus_sysfs_write_ ## thingy \ } KOVAPLUS_BIN_ATTRIBUTE_W(control, CONTROL); KOVAPLUS_BIN_ATTRIBUTE_RW(info, INFO); KOVAPLUS_BIN_ATTRIBUTE_RW(profile_settings, PROFILE_SETTINGS); KOVAPLUS_BIN_ATTRIBUTE_RW(profile_buttons, PROFILE_BUTTONS); static ssize_t kovaplus_sysfs_read_profilex_settings(struct file *fp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = kovaplus_select_profile(usb_dev, *(uint *)(attr->private), KOVAPLUS_CONTROL_REQUEST_PROFILE_SETTINGS); if (retval) return retval; return kovaplus_sysfs_read(fp, kobj, buf, off, count, KOVAPLUS_SIZE_PROFILE_SETTINGS, KOVAPLUS_COMMAND_PROFILE_SETTINGS); } static ssize_t kovaplus_sysfs_read_profilex_buttons(struct file *fp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj)->parent->parent; struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); ssize_t retval; retval = kovaplus_select_profile(usb_dev, *(uint *)(attr->private), KOVAPLUS_CONTROL_REQUEST_PROFILE_BUTTONS); if (retval) return retval; return kovaplus_sysfs_read(fp, kobj, buf, off, count, KOVAPLUS_SIZE_PROFILE_BUTTONS, KOVAPLUS_COMMAND_PROFILE_BUTTONS); } #define PROFILE_ATTR(number) \ static struct bin_attribute bin_attr_profile##number##_settings = { \ .attr = { .name = "profile" #number "_settings", .mode = 0440 }, \ .size = KOVAPLUS_SIZE_PROFILE_SETTINGS, \ .read = kovaplus_sysfs_read_profilex_settings, \ .private = &profile_numbers[number-1], \ }; \ static struct bin_attribute bin_attr_profile##number##_buttons = { \ .attr = { .name = "profile" #number "_buttons", .mode = 0440 }, \ .size = KOVAPLUS_SIZE_PROFILE_BUTTONS, \ .read = kovaplus_sysfs_read_profilex_buttons, \ .private = &profile_numbers[number-1], \ }; PROFILE_ATTR(1); PROFILE_ATTR(2); PROFILE_ATTR(3); PROFILE_ATTR(4); PROFILE_ATTR(5); static ssize_t kovaplus_sysfs_show_actual_profile(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return sysfs_emit(buf, "%d\n", kovaplus->actual_profile); } static ssize_t kovaplus_sysfs_set_actual_profile(struct device *dev, struct device_attribute *attr, char const *buf, size_t size) { struct kovaplus_device *kovaplus; struct usb_device *usb_dev; unsigned long profile; int retval; struct kovaplus_roccat_report roccat_report; dev = dev->parent->parent; kovaplus = hid_get_drvdata(dev_get_drvdata(dev)); usb_dev = interface_to_usbdev(to_usb_interface(dev)); retval = kstrtoul(buf, 10, &profile); if (retval) return retval; if (profile >= 5) return -EINVAL; mutex_lock(&kovaplus->kovaplus_lock); retval = kovaplus_set_actual_profile(usb_dev, profile); if (retval) { mutex_unlock(&kovaplus->kovaplus_lock); return retval; } kovaplus_profile_activated(kovaplus, profile); roccat_report.type = KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE_1; roccat_report.profile = profile + 1; roccat_report.button = 0; roccat_report.data1 = profile + 1; roccat_report.data2 = 0; roccat_report_event(kovaplus->chrdev_minor, (uint8_t const *)&roccat_report); mutex_unlock(&kovaplus->kovaplus_lock); return size; } static DEVICE_ATTR(actual_profile, 0660, kovaplus_sysfs_show_actual_profile, kovaplus_sysfs_set_actual_profile); static ssize_t kovaplus_sysfs_show_actual_cpi(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return sysfs_emit(buf, "%d\n", kovaplus->actual_cpi); } static DEVICE_ATTR(actual_cpi, 0440, kovaplus_sysfs_show_actual_cpi, NULL); static ssize_t kovaplus_sysfs_show_actual_sensitivity_x(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return sysfs_emit(buf, "%d\n", kovaplus->actual_x_sensitivity); } static DEVICE_ATTR(actual_sensitivity_x, 0440, kovaplus_sysfs_show_actual_sensitivity_x, NULL); static ssize_t kovaplus_sysfs_show_actual_sensitivity_y(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus = hid_get_drvdata(dev_get_drvdata(dev->parent->parent)); return sysfs_emit(buf, "%d\n", kovaplus->actual_y_sensitivity); } static DEVICE_ATTR(actual_sensitivity_y, 0440, kovaplus_sysfs_show_actual_sensitivity_y, NULL); static ssize_t kovaplus_sysfs_show_firmware_version(struct device *dev, struct device_attribute *attr, char *buf) { struct kovaplus_device *kovaplus; struct usb_device *usb_dev; struct kovaplus_info info; dev = dev->parent->parent; kovaplus = hid_get_drvdata(dev_get_drvdata(dev)); usb_dev = interface_to_usbdev(to_usb_interface(dev)); mutex_lock(&kovaplus->kovaplus_lock); roccat_common2_receive(usb_dev, KOVAPLUS_COMMAND_INFO, &info, KOVAPLUS_SIZE_INFO); mutex_unlock(&kovaplus->kovaplus_lock); return sysfs_emit(buf, "%d\n", info.firmware_version); } static DEVICE_ATTR(firmware_version, 0440, kovaplus_sysfs_show_firmware_version, NULL); static struct attribute *kovaplus_attrs[] = { &dev_attr_actual_cpi.attr, &dev_attr_firmware_version.attr, &dev_attr_actual_profile.attr, &dev_attr_actual_sensitivity_x.attr, &dev_attr_actual_sensitivity_y.attr, NULL, }; static struct bin_attribute *kovaplus_bin_attributes[] = { &bin_attr_control, &bin_attr_info, &bin_attr_profile_settings, &bin_attr_profile_buttons, &bin_attr_profile1_settings, &bin_attr_profile2_settings, &bin_attr_profile3_settings, &bin_attr_profile4_settings, &bin_attr_profile5_settings, &bin_attr_profile1_buttons, &bin_attr_profile2_buttons, &bin_attr_profile3_buttons, &bin_attr_profile4_buttons, &bin_attr_profile5_buttons, NULL, }; static const struct attribute_group kovaplus_group = { .attrs = kovaplus_attrs, .bin_attrs = kovaplus_bin_attributes, }; static const struct attribute_group *kovaplus_groups[] = { &kovaplus_group, NULL, }; static const struct class kovaplus_class = { .name = "kovaplus", .dev_groups = kovaplus_groups, }; static int kovaplus_init_kovaplus_device_struct(struct usb_device *usb_dev, struct kovaplus_device *kovaplus) { int retval, i; static uint wait = 70; /* device will freeze with just 60 */ mutex_init(&kovaplus->kovaplus_lock); for (i = 0; i < 5; ++i) { msleep(wait); retval = kovaplus_get_profile_settings(usb_dev, &kovaplus->profile_settings[i], i); if (retval) return retval; msleep(wait); retval = kovaplus_get_profile_buttons(usb_dev, &kovaplus->profile_buttons[i], i); if (retval) return retval; } msleep(wait); retval = kovaplus_get_actual_profile(usb_dev); if (retval < 0) return retval; kovaplus_profile_activated(kovaplus, retval); return 0; } static int kovaplus_init_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct usb_device *usb_dev = interface_to_usbdev(intf); struct kovaplus_device *kovaplus; int retval; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { kovaplus = kzalloc(sizeof(*kovaplus), GFP_KERNEL); if (!kovaplus) { hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, kovaplus); retval = kovaplus_init_kovaplus_device_struct(usb_dev, kovaplus); if (retval) { hid_err(hdev, "couldn't init struct kovaplus_device\n"); goto exit_free; } retval = roccat_connect(&kovaplus_class, hdev, sizeof(struct kovaplus_roccat_report)); if (retval < 0) { hid_err(hdev, "couldn't init char dev\n"); } else { kovaplus->chrdev_minor = retval; kovaplus->roccat_claimed = 1; } } else { hid_set_drvdata(hdev, NULL); } return 0; exit_free: kfree(kovaplus); return retval; } static void kovaplus_remove_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct kovaplus_device *kovaplus; if (intf->cur_altsetting->desc.bInterfaceProtocol == USB_INTERFACE_PROTOCOL_MOUSE) { kovaplus = hid_get_drvdata(hdev); if (kovaplus->roccat_claimed) roccat_disconnect(kovaplus->chrdev_minor); kfree(kovaplus); } } static int kovaplus_probe(struct hid_device *hdev, const struct hid_device_id *id) { int retval; if (!hid_is_usb(hdev)) return -EINVAL; retval = hid_parse(hdev); if (retval) { hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { hid_err(hdev, "hw start failed\n"); goto exit; } retval = kovaplus_init_specials(hdev); if (retval) { hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } return 0; exit_stop: hid_hw_stop(hdev); exit: return retval; } static void kovaplus_remove(struct hid_device *hdev) { kovaplus_remove_specials(hdev); hid_hw_stop(hdev); } static void kovaplus_keep_values_up_to_date(struct kovaplus_device *kovaplus, u8 const *data) { struct kovaplus_mouse_report_button const *button_report; if (data[0] != KOVAPLUS_MOUSE_REPORT_NUMBER_BUTTON) return; button_report = (struct kovaplus_mouse_report_button const *)data; switch (button_report->type) { case KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE_1: kovaplus_profile_activated(kovaplus, button_report->data1 - 1); break; case KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_CPI: kovaplus->actual_cpi = kovaplus_convert_event_cpi(button_report->data1); break; case KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_SENSITIVITY: kovaplus->actual_x_sensitivity = button_report->data1; kovaplus->actual_y_sensitivity = button_report->data2; break; default: break; } } static void kovaplus_report_to_chrdev(struct kovaplus_device const *kovaplus, u8 const *data) { struct kovaplus_roccat_report roccat_report; struct kovaplus_mouse_report_button const *button_report; if (data[0] != KOVAPLUS_MOUSE_REPORT_NUMBER_BUTTON) return; button_report = (struct kovaplus_mouse_report_button const *)data; if (button_report->type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_PROFILE_2) return; roccat_report.type = button_report->type; roccat_report.profile = kovaplus->actual_profile + 1; if (roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_MACRO || roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_SHORTCUT || roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_QUICKLAUNCH || roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_TIMER) roccat_report.button = button_report->data1; else roccat_report.button = 0; if (roccat_report.type == KOVAPLUS_MOUSE_REPORT_BUTTON_TYPE_CPI) roccat_report.data1 = kovaplus_convert_event_cpi(button_report->data1); else roccat_report.data1 = button_report->data1; roccat_report.data2 = button_report->data2; roccat_report_event(kovaplus->chrdev_minor, (uint8_t const *)&roccat_report); } static int kovaplus_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, int size) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct kovaplus_device *kovaplus = hid_get_drvdata(hdev); if (intf->cur_altsetting->desc.bInterfaceProtocol != USB_INTERFACE_PROTOCOL_MOUSE) return 0; if (kovaplus == NULL) return 0; kovaplus_keep_values_up_to_date(kovaplus, data); if (kovaplus->roccat_claimed) kovaplus_report_to_chrdev(kovaplus, data); return 0; } static const struct hid_device_id kovaplus_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_KOVAPLUS) }, { } }; MODULE_DEVICE_TABLE(hid, kovaplus_devices); static struct hid_driver kovaplus_driver = { .name = "kovaplus", .id_table = kovaplus_devices, .probe = kovaplus_probe, .remove = kovaplus_remove, .raw_event = kovaplus_raw_event }; static int __init kovaplus_init(void) { int retval; retval = class_register(&kovaplus_class); if (retval) return retval; retval = hid_register_driver(&kovaplus_driver); if (retval) class_unregister(&kovaplus_class); return retval; } static void __exit kovaplus_exit(void) { hid_unregister_driver(&kovaplus_driver); class_unregister(&kovaplus_class); } module_init(kovaplus_init); module_exit(kovaplus_exit); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat Kova[+] driver"); MODULE_LICENSE("GPL v2");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BMAP_H__ #define __XFS_BMAP_H__ struct getbmap; struct xfs_bmbt_irec; struct xfs_ifork; struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_alloc_arg; /* * Argument structure for xfs_bmap_alloc. */ struct xfs_bmalloca { struct xfs_trans *tp; /* transaction pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct xfs_bmbt_irec prev; /* extent before the new one */ struct xfs_bmbt_irec got; /* extent after, or delayed */ xfs_fileoff_t offset; /* offset in file filling in */ xfs_extlen_t length; /* i/o length asked/allocated */ xfs_fsblock_t blkno; /* starting block of new extent */ struct xfs_btree_cur *cur; /* btree cursor */ struct xfs_iext_cursor icur; /* incore extent cursor */ int nallocs;/* number of extents alloc'd */ int logflags;/* flags for transaction logging */ xfs_extlen_t total; /* total blocks needed for xaction */ xfs_extlen_t minlen; /* minimum allocation size (blocks) */ xfs_extlen_t minleft; /* amount must be left after alloc */ bool eof; /* set if allocating past last extent */ bool wasdel; /* replacing a delayed allocation */ bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ int datatype;/* data type being allocated */ uint32_t flags; }; #define XFS_BMAP_MAX_NMAP 4 /* * Flags for xfs_bmapi_* */ #define XFS_BMAPI_ENTIRE (1u << 0) /* return entire extent untrimmed */ #define XFS_BMAPI_METADATA (1u << 1) /* mapping metadata not user data */ #define XFS_BMAPI_ATTRFORK (1u << 2) /* use attribute fork not data */ #define XFS_BMAPI_PREALLOC (1u << 3) /* preallocating unwritten space */ #define XFS_BMAPI_CONTIG (1u << 4) /* must allocate only one extent */ /* * unwritten extent conversion - this needs write cache flushing and no additional * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts * from written to unwritten, otherwise convert from unwritten to written. */ #define XFS_BMAPI_CONVERT (1u << 5) /* * allocate zeroed extents - this requires all newly allocated user data extents * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set. * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found * during the allocation range to zeroed written extents. */ #define XFS_BMAPI_ZERO (1u << 6) /* * Map the inode offset to the block given in ap->firstblock. Primarily * used for reflink. The range must be in a hole, and this flag cannot be * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork. * * For bunmapi, this flag unmaps the range without adjusting quota, reducing * refcount, or freeing the blocks. */ #define XFS_BMAPI_REMAP (1u << 7) /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK (1u << 8) /* Skip online discard of freed extents */ #define XFS_BMAPI_NODISCARD (1u << 9) /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP (1u << 10) #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ { XFS_BMAPI_ATTRFORK, "ATTRFORK" }, \ { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ { XFS_BMAPI_CONVERT, "CONVERT" }, \ { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ { XFS_BMAPI_NORMAP, "NORMAP" } static inline int xfs_bmapi_aflag(int w) { return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0)); } static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags) { if (bmapi_flags & XFS_BMAPI_COWFORK) return XFS_COW_FORK; else if (bmapi_flags & XFS_BMAPI_ATTRFORK) return XFS_ATTR_FORK; return XFS_DATA_FORK; } void xfs_bmap_alloc_account(struct xfs_bmalloca *ap); /* * Special values for xfs_bmbt_irec_t br_startblock field. */ #define DELAYSTARTBLOCK ((xfs_fsblock_t)-1LL) #define HOLESTARTBLOCK ((xfs_fsblock_t)-2LL) /* * Flags for xfs_bmap_add_extent*. */ #define BMAP_LEFT_CONTIG (1u << 0) #define BMAP_RIGHT_CONTIG (1u << 1) #define BMAP_LEFT_FILLING (1u << 2) #define BMAP_RIGHT_FILLING (1u << 3) #define BMAP_LEFT_DELAY (1u << 4) #define BMAP_RIGHT_DELAY (1u << 5) #define BMAP_LEFT_VALID (1u << 6) #define BMAP_RIGHT_VALID (1u << 7) #define BMAP_ATTRFORK (1u << 8) #define BMAP_COWFORK (1u << 9) #define XFS_BMAP_EXT_FLAGS \ { BMAP_LEFT_CONTIG, "LC" }, \ { BMAP_RIGHT_CONTIG, "RC" }, \ { BMAP_LEFT_FILLING, "LF" }, \ { BMAP_RIGHT_FILLING, "RF" }, \ { BMAP_ATTRFORK, "ATTR" }, \ { BMAP_COWFORK, "COW" } /* Return true if the extent is an allocated extent, written or not. */ static inline bool xfs_bmap_is_real_extent(const struct xfs_bmbt_irec *irec) { return irec->br_startblock != HOLESTARTBLOCK && irec->br_startblock != DELAYSTARTBLOCK && !isnullstartblock(irec->br_startblock); } /* * Return true if the extent is a real, allocated extent, or false if it is a * delayed allocation, and unwritten extent or a hole. */ static inline bool xfs_bmap_is_written_extent(const struct xfs_bmbt_irec *irec) { return xfs_bmap_is_real_extent(irec) && irec->br_state != XFS_EXT_UNWRITTEN; } /* * Check the mapping for obviously garbage allocations that could trash the * filesystem immediately. */ #define xfs_valid_startblock(ip, startblock) \ ((startblock) != 0 || XFS_IS_REALTIME_INODE(ip)) int xfs_bmap_longest_free_extent(struct xfs_perag *pag, struct xfs_trans *tp, xfs_extlen_t *blen); void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); int xfs_bmap_add_attrfork(struct xfs_trans *tp, struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); int xfs_bmap_local_to_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t total, int *logflagsp, int whichfork, void (*init_fn)(struct xfs_trans *tp, struct xfs_buf *bp, struct xfs_inode *ip, struct xfs_ifork *ifp, void *priv), void *priv); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); int xfs_bmap_last_before(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *last_block, int whichfork); int xfs_bmap_last_offset(struct xfs_inode *ip, xfs_fileoff_t *unused, int whichfork); int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, struct xfs_bmbt_irec *mval, int *nmap, uint32_t flags); int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap); int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags, xfs_extnum_t nexts, int *done); void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done); int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off, xfs_fileoff_t shift); int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp); xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, int fork); int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, XFS_BMAP_UNMAP, }; #define XFS_BMAP_INTENT_STRINGS \ { XFS_BMAP_MAP, "map" }, \ { XFS_BMAP_UNMAP, "unmap" } struct xfs_bmap_intent { struct list_head bi_list; enum xfs_bmap_intent_type bi_type; int bi_whichfork; struct xfs_inode *bi_owner; struct xfs_group *bi_group; struct xfs_bmbt_irec bi_bmap; }; int xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi); void xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); void xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *imap); static inline uint32_t xfs_bmap_fork_to_state(int whichfork) { switch (whichfork) { case XFS_ATTR_FORK: return BMAP_ATTRFORK; case XFS_COW_FORK: return BMAP_COWFORK; default: return 0; } } xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile, int whichfork, struct xfs_bmbt_irec *irec); xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork, xfs_failaddr_t fa, const struct xfs_bmbt_irec *irec); int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, uint32_t flags); int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip, uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff); extern struct kmem_cache *xfs_bmap_intent_cache; int __init xfs_bmap_intent_init_cache(void); void xfs_bmap_intent_destroy_cache(void); typedef int (*xfs_bmap_query_range_fn)( struct xfs_btree_cur *cur, struct xfs_bmbt_irec *rec, void *priv); int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn, void *priv); xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); #endif /* __XFS_BMAP_H__ */
2 2 1 1 1 1 1 1 1 1 1 2 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 // SPDX-License-Identifier: GPL-2.0 /* * Simple file system for zoned block devices exposing zones as files. * * Copyright (C) 2019 Western Digital Corporation or its affiliates. */ #include <linux/module.h> #include <linux/pagemap.h> #include <linux/magic.h> #include <linux/iomap.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/statfs.h> #include <linux/writeback.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/mman.h> #include <linux/sched/mm.h> #include <linux/crc32.h> #include <linux/task_io_accounting_ops.h> #include <linux/fs_parser.h> #include <linux/fs_context.h> #include "zonefs.h" #define CREATE_TRACE_POINTS #include "trace.h" /* * Get the name of a zone group directory. */ static const char *zonefs_zgroup_name(enum zonefs_ztype ztype) { switch (ztype) { case ZONEFS_ZTYPE_CNV: return "cnv"; case ZONEFS_ZTYPE_SEQ: return "seq"; default: WARN_ON_ONCE(1); return "???"; } } /* * Manage the active zone count. */ static void zonefs_account_active(struct super_block *sb, struct zonefs_zone *z) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); if (zonefs_zone_is_cnv(z)) return; /* * For zones that transitioned to the offline or readonly condition, * we only need to clear the active state. */ if (z->z_flags & (ZONEFS_ZONE_OFFLINE | ZONEFS_ZONE_READONLY)) goto out; /* * If the zone is active, that is, if it is explicitly open or * partially written, check if it was already accounted as active. */ if ((z->z_flags & ZONEFS_ZONE_OPEN) || (z->z_wpoffset > 0 && z->z_wpoffset < z->z_capacity)) { if (!(z->z_flags & ZONEFS_ZONE_ACTIVE)) { z->z_flags |= ZONEFS_ZONE_ACTIVE; atomic_inc(&sbi->s_active_seq_files); } return; } out: /* The zone is not active. If it was, update the active count */ if (z->z_flags & ZONEFS_ZONE_ACTIVE) { z->z_flags &= ~ZONEFS_ZONE_ACTIVE; atomic_dec(&sbi->s_active_seq_files); } } /* * Manage the active zone count. Called with zi->i_truncate_mutex held. */ void zonefs_inode_account_active(struct inode *inode) { lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); return zonefs_account_active(inode->i_sb, zonefs_inode_zone(inode)); } /* * Execute a zone management operation. */ static int zonefs_zone_mgmt(struct super_block *sb, struct zonefs_zone *z, enum req_op op) { int ret; /* * With ZNS drives, closing an explicitly open zone that has not been * written will change the zone state to "closed", that is, the zone * will remain active. Since this can then cause failure of explicit * open operation on other zones if the drive active zone resources * are exceeded, make sure that the zone does not remain active by * resetting it. */ if (op == REQ_OP_ZONE_CLOSE && !z->z_wpoffset) op = REQ_OP_ZONE_RESET; trace_zonefs_zone_mgmt(sb, z, op); ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, z->z_size >> SECTOR_SHIFT); if (ret) { zonefs_err(sb, "Zone management operation %s at %llu failed %d\n", blk_op_str(op), z->z_sector, ret); return ret; } return 0; } int zonefs_inode_zone_mgmt(struct inode *inode, enum req_op op) { lockdep_assert_held(&ZONEFS_I(inode)->i_truncate_mutex); return zonefs_zone_mgmt(inode->i_sb, zonefs_inode_zone(inode), op); } void zonefs_i_size_write(struct inode *inode, loff_t isize) { struct zonefs_zone *z = zonefs_inode_zone(inode); i_size_write(inode, isize); /* * A full zone is no longer open/active and does not need * explicit closing. */ if (isize >= z->z_capacity) { struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); if (z->z_flags & ZONEFS_ZONE_ACTIVE) atomic_dec(&sbi->s_active_seq_files); z->z_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE); } } void zonefs_update_stats(struct inode *inode, loff_t new_isize) { struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); loff_t old_isize = i_size_read(inode); loff_t nr_blocks; if (new_isize == old_isize) return; spin_lock(&sbi->s_lock); /* * This may be called for an update after an IO error. * So beware of the values seen. */ if (new_isize < old_isize) { nr_blocks = (old_isize - new_isize) >> sb->s_blocksize_bits; if (sbi->s_used_blocks > nr_blocks) sbi->s_used_blocks -= nr_blocks; else sbi->s_used_blocks = 0; } else { sbi->s_used_blocks += (new_isize - old_isize) >> sb->s_blocksize_bits; if (sbi->s_used_blocks > sbi->s_blocks) sbi->s_used_blocks = sbi->s_blocks; } spin_unlock(&sbi->s_lock); } /* * Check a zone condition. Return the amount of written (and still readable) * data in the zone. */ static loff_t zonefs_check_zone_condition(struct super_block *sb, struct zonefs_zone *z, struct blk_zone *zone) { switch (zone->cond) { case BLK_ZONE_COND_OFFLINE: zonefs_warn(sb, "Zone %llu: offline zone\n", z->z_sector); z->z_flags |= ZONEFS_ZONE_OFFLINE; return 0; case BLK_ZONE_COND_READONLY: /* * The write pointer of read-only zones is invalid, so we cannot * determine the zone wpoffset (inode size). We thus keep the * zone wpoffset as is, which leads to an empty file * (wpoffset == 0) on mount. For a runtime error, this keeps * the inode size as it was when last updated so that the user * can recover data. */ zonefs_warn(sb, "Zone %llu: read-only zone\n", z->z_sector); z->z_flags |= ZONEFS_ZONE_READONLY; if (zonefs_zone_is_cnv(z)) return z->z_capacity; return z->z_wpoffset; case BLK_ZONE_COND_FULL: /* The write pointer of full zones is invalid. */ return z->z_capacity; default: if (zonefs_zone_is_cnv(z)) return z->z_capacity; return (zone->wp - zone->start) << SECTOR_SHIFT; } } /* * Check a zone condition and adjust its inode access permissions for * offline and readonly zones. */ static void zonefs_inode_update_mode(struct inode *inode) { struct zonefs_zone *z = zonefs_inode_zone(inode); if (z->z_flags & ZONEFS_ZONE_OFFLINE) { /* Offline zones cannot be read nor written */ inode->i_flags |= S_IMMUTABLE; inode->i_mode &= ~0777; } else if (z->z_flags & ZONEFS_ZONE_READONLY) { /* Readonly zones cannot be written */ inode->i_flags |= S_IMMUTABLE; if (z->z_flags & ZONEFS_ZONE_INIT_MODE) inode->i_mode &= ~0777; else inode->i_mode &= ~0222; } z->z_flags &= ~ZONEFS_ZONE_INIT_MODE; z->z_mode = inode->i_mode; } static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *z = data; *z = *zone; return 0; } static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, bool write) { struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); loff_t isize, data_size; /* * Check the zone condition: if the zone is not "bad" (offline or * read-only), read errors are simply signaled to the IO issuer as long * as there is no inconsistency between the inode size and the amount of * data writen in the zone (data_size). */ data_size = zonefs_check_zone_condition(sb, z, zone); isize = i_size_read(inode); if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) && !write && isize == data_size) return; /* * At this point, we detected either a bad zone or an inconsistency * between the inode size and the amount of data written in the zone. * For the latter case, the cause may be a write IO error or an external * action on the device. Two error patterns exist: * 1) The inode size is lower than the amount of data in the zone: * a write operation partially failed and data was writen at the end * of the file. This can happen in the case of a large direct IO * needing several BIOs and/or write requests to be processed. * 2) The inode size is larger than the amount of data in the zone: * this can happen with a deferred write error with the use of the * device side write cache after getting successful write IO * completions. Other possibilities are (a) an external corruption, * e.g. an application reset the zone directly, or (b) the device * has a serious problem (e.g. firmware bug). * * In all cases, warn about inode size inconsistency and handle the * IO error according to the zone condition and to the mount options. */ if (isize != data_size) zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n", inode->i_ino, isize, data_size); /* * First handle bad zones signaled by hardware. The mount options * errors=zone-ro and errors=zone-offline result in changing the * zone condition to read-only and offline respectively, as if the * condition was signaled by the hardware. */ if ((z->z_flags & ZONEFS_ZONE_OFFLINE) || (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)) { zonefs_warn(sb, "inode %lu: read/write access disabled\n", inode->i_ino); if (!(z->z_flags & ZONEFS_ZONE_OFFLINE)) z->z_flags |= ZONEFS_ZONE_OFFLINE; zonefs_inode_update_mode(inode); data_size = 0; } else if ((z->z_flags & ZONEFS_ZONE_READONLY) || (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)) { zonefs_warn(sb, "inode %lu: write access disabled\n", inode->i_ino); if (!(z->z_flags & ZONEFS_ZONE_READONLY)) z->z_flags |= ZONEFS_ZONE_READONLY; zonefs_inode_update_mode(inode); data_size = isize; } else if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO && data_size > isize) { /* Do not expose garbage data */ data_size = isize; } /* * If the filesystem is mounted with the explicit-open mount option, we * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to * the read-only or offline condition, to avoid attempting an explicit * close of the zone when the inode file is closed. */ if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && (z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE))) z->z_flags &= ~ZONEFS_ZONE_OPEN; /* * If error=remount-ro was specified, any error result in remounting * the volume as read-only. */ if ((sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) && !sb_rdonly(sb)) { zonefs_warn(sb, "remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; } /* * Update block usage stats and the inode size to prevent access to * invalid data. */ zonefs_update_stats(inode, data_size); zonefs_i_size_write(inode, data_size); z->z_wpoffset = data_size; zonefs_inode_account_active(inode); } /* * When an file IO error occurs, check the file zone to see if there is a change * in the zone condition (e.g. offline or read-only). For a failed write to a * sequential zone, the zone write pointer position must also be checked to * eventually correct the file size and zonefs inode write pointer offset * (which can be out of sync with the drive due to partial write failures). */ void __zonefs_io_error(struct inode *inode, bool write) { struct zonefs_zone *z = zonefs_inode_zone(inode); struct super_block *sb = inode->i_sb; unsigned int noio_flag; struct blk_zone zone; int ret; /* * Conventional zone have no write pointer and cannot become read-only * or offline. So simply fake a report for a single or aggregated zone * and let zonefs_handle_io_error() correct the zone inode information * according to the mount options. */ if (!zonefs_zone_is_seq(z)) { zone.start = z->z_sector; zone.len = z->z_size >> SECTOR_SHIFT; zone.wp = zone.start + zone.len; zone.type = BLK_ZONE_TYPE_CONVENTIONAL; zone.cond = BLK_ZONE_COND_NOT_WP; zone.capacity = zone.len; goto handle_io_error; } /* * Memory allocations in blkdev_report_zones() can trigger a memory * reclaim which may in turn cause a recursion into zonefs as well as * struct request allocations for the same device. The former case may * end up in a deadlock on the inode truncate mutex, while the latter * may prevent IO forward progress. Executing the report zones under * the GFP_NOIO context avoids both problems. */ noio_flag = memalloc_noio_save(); ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1, zonefs_io_error_cb, &zone); memalloc_noio_restore(noio_flag); if (ret != 1) { zonefs_err(sb, "Get inode %lu zone information failed %d\n", inode->i_ino, ret); zonefs_warn(sb, "remounting filesystem read-only\n"); sb->s_flags |= SB_RDONLY; return; } handle_io_error: zonefs_handle_io_error(inode, &zone, write); } static struct kmem_cache *zonefs_inode_cachep; static struct inode *zonefs_alloc_inode(struct super_block *sb) { struct zonefs_inode_info *zi; zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL); if (!zi) return NULL; inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); zi->i_wr_refcnt = 0; return &zi->i_vnode; } static void zonefs_free_inode(struct inode *inode) { kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode)); } /* * File system stat. */ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); enum zonefs_ztype t; buf->f_type = ZONEFS_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_namelen = ZONEFS_NAME_MAX; spin_lock(&sbi->s_lock); buf->f_blocks = sbi->s_blocks; if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks)) buf->f_bfree = 0; else buf->f_bfree = buf->f_blocks - sbi->s_used_blocks; buf->f_bavail = buf->f_bfree; for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) { if (sbi->s_zgroup[t].g_nr_zones) buf->f_files += sbi->s_zgroup[t].g_nr_zones + 1; } buf->f_ffree = 0; spin_unlock(&sbi->s_lock); buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b); return 0; } enum { Opt_errors, Opt_explicit_open, }; struct zonefs_context { unsigned long s_mount_opts; }; static const struct constant_table zonefs_param_errors[] = { {"remount-ro", ZONEFS_MNTOPT_ERRORS_RO}, {"zone-ro", ZONEFS_MNTOPT_ERRORS_ZRO}, {"zone-offline", ZONEFS_MNTOPT_ERRORS_ZOL}, {"repair", ZONEFS_MNTOPT_ERRORS_REPAIR}, {} }; static const struct fs_parameter_spec zonefs_param_spec[] = { fsparam_enum ("errors", Opt_errors, zonefs_param_errors), fsparam_flag ("explicit-open", Opt_explicit_open), {} }; static int zonefs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct zonefs_context *ctx = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, zonefs_param_spec, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_errors: ctx->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; ctx->s_mount_opts |= result.uint_32; break; case Opt_explicit_open: ctx->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; break; default: return -EINVAL; } return 0; } static int zonefs_show_options(struct seq_file *seq, struct dentry *root) { struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb); if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) seq_puts(seq, ",errors=remount-ro"); if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) seq_puts(seq, ",errors=zone-ro"); if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) seq_puts(seq, ",errors=zone-offline"); if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR) seq_puts(seq, ",errors=repair"); return 0; } static int zonefs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; ret = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (ret) return ret; /* * Since files and directories cannot be created nor deleted, do not * allow setting any write attributes on the sub-directories grouping * files by zone type. */ if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) && (iattr->ia_mode & 0222)) return -EPERM; if (((iattr->ia_valid & ATTR_UID) && !uid_eq(iattr->ia_uid, inode->i_uid)) || ((iattr->ia_valid & ATTR_GID) && !gid_eq(iattr->ia_gid, inode->i_gid))) { ret = dquot_transfer(&nop_mnt_idmap, inode, iattr); if (ret) return ret; } if (iattr->ia_valid & ATTR_SIZE) { ret = zonefs_file_truncate(inode, iattr->ia_size); if (ret) return ret; } setattr_copy(&nop_mnt_idmap, inode, iattr); if (S_ISREG(inode->i_mode)) { struct zonefs_zone *z = zonefs_inode_zone(inode); z->z_mode = inode->i_mode; z->z_uid = inode->i_uid; z->z_gid = inode->i_gid; } return 0; } static const struct inode_operations zonefs_file_inode_operations = { .setattr = zonefs_inode_setattr, }; static long zonefs_fname_to_fno(const struct qstr *fname) { const char *name = fname->name; unsigned int len = fname->len; long fno = 0, shift = 1; const char *rname; char c = *name; unsigned int i; /* * File names are always a base-10 number string without any * leading 0s. */ if (!isdigit(c)) return -ENOENT; if (len > 1 && c == '0') return -ENOENT; if (len == 1) return c - '0'; for (i = 0, rname = name + len - 1; i < len; i++, rname--) { c = *rname; if (!isdigit(c)) return -ENOENT; fno += (c - '0') * shift; shift *= 10; } return fno; } static struct inode *zonefs_get_file_inode(struct inode *dir, struct dentry *dentry) { struct zonefs_zone_group *zgroup = dir->i_private; struct super_block *sb = dir->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_zone *z; struct inode *inode; ino_t ino; long fno; /* Get the file number from the file name */ fno = zonefs_fname_to_fno(&dentry->d_name); if (fno < 0) return ERR_PTR(fno); if (!zgroup->g_nr_zones || fno >= zgroup->g_nr_zones) return ERR_PTR(-ENOENT); z = &zgroup->g_zones[fno]; ino = z->z_sector >> sbi->s_zone_sectors_shift; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { WARN_ON_ONCE(inode->i_private != z); return inode; } inode->i_ino = ino; inode->i_mode = z->z_mode; inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(dir)))); inode->i_uid = z->z_uid; inode->i_gid = z->z_gid; inode->i_size = z->z_wpoffset; inode->i_blocks = z->z_capacity >> SECTOR_SHIFT; inode->i_private = z; inode->i_op = &zonefs_file_inode_operations; inode->i_fop = &zonefs_file_operations; inode->i_mapping->a_ops = &zonefs_file_aops; mapping_set_large_folios(inode->i_mapping); /* Update the inode access rights depending on the zone condition */ zonefs_inode_update_mode(inode); unlock_new_inode(inode); return inode; } static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, enum zonefs_ztype ztype) { struct inode *root = d_inode(sb->s_root); struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct inode *inode; ino_t ino = bdev_nr_zones(sb->s_bdev) + ztype + 1; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; inode->i_ino = ino; inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555); inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; inode_set_mtime_to_ts(inode, inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(root)))); inode->i_private = &sbi->s_zgroup[ztype]; set_nlink(inode, 2); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &zonefs_dir_operations; unlock_new_inode(inode); return inode; } static struct inode *zonefs_get_dir_inode(struct inode *dir, struct dentry *dentry) { struct super_block *sb = dir->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); const char *name = dentry->d_name.name; enum zonefs_ztype ztype; /* * We only need to check for the "seq" directory and * the "cnv" directory if we have conventional zones. */ if (dentry->d_name.len != 3) return ERR_PTR(-ENOENT); for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { if (sbi->s_zgroup[ztype].g_nr_zones && memcmp(name, zonefs_zgroup_name(ztype), 3) == 0) break; } if (ztype == ZONEFS_ZTYPE_MAX) return ERR_PTR(-ENOENT); return zonefs_get_zgroup_inode(sb, ztype); } static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode; if (dentry->d_name.len > ZONEFS_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); if (dir == d_inode(dir->i_sb->s_root)) inode = zonefs_get_dir_inode(dir, dentry); else inode = zonefs_get_file_inode(dir, dentry); return d_splice_alias(inode, dentry); } static int zonefs_readdir_root(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); enum zonefs_ztype ztype = ZONEFS_ZTYPE_CNV; ino_t base_ino = bdev_nr_zones(sb->s_bdev) + 1; if (ctx->pos >= inode->i_size) return 0; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) { if (!sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) ztype = ZONEFS_ZTYPE_SEQ; if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3, base_ino + ztype, DT_DIR)) return 0; ctx->pos++; } if (ctx->pos == 3 && ztype != ZONEFS_ZTYPE_SEQ) { ztype = ZONEFS_ZTYPE_SEQ; if (!dir_emit(ctx, zonefs_zgroup_name(ztype), 3, base_ino + ztype, DT_DIR)) return 0; ctx->pos++; } return 0; } static int zonefs_readdir_zgroup(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct zonefs_zone_group *zgroup = inode->i_private; struct super_block *sb = inode->i_sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_zone *z; int fname_len; char *fname; ino_t ino; int f; /* * The size of zone group directories is equal to the number * of zone files in the group and does note include the "." and * ".." entries. Hence the "+ 2" here. */ if (ctx->pos >= inode->i_size + 2) return 0; if (!dir_emit_dots(file, ctx)) return 0; fname = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL); if (!fname) return -ENOMEM; for (f = ctx->pos - 2; f < zgroup->g_nr_zones; f++) { z = &zgroup->g_zones[f]; ino = z->z_sector >> sbi->s_zone_sectors_shift; fname_len = snprintf(fname, ZONEFS_NAME_MAX - 1, "%u", f); if (!dir_emit(ctx, fname, fname_len, ino, DT_REG)) break; ctx->pos++; } kfree(fname); return 0; } static int zonefs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); if (inode == d_inode(inode->i_sb->s_root)) return zonefs_readdir_root(file, ctx); return zonefs_readdir_zgroup(file, ctx); } const struct inode_operations zonefs_dir_inode_operations = { .lookup = zonefs_lookup, .setattr = zonefs_inode_setattr, }; const struct file_operations zonefs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = zonefs_readdir, }; struct zonefs_zone_data { struct super_block *sb; unsigned int nr_zones[ZONEFS_ZTYPE_MAX]; sector_t cnv_zone_start; struct blk_zone *zones; }; static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct zonefs_zone_data *zd = data; struct super_block *sb = zd->sb; struct zonefs_sb_info *sbi = ZONEFS_SB(sb); /* * We do not care about the first zone: it contains the super block * and not exposed as a file. */ if (!idx) return 0; /* * Count the number of zones that will be exposed as files. * For sequential zones, we always have as many files as zones. * FOr conventional zones, the number of files depends on if we have * conventional zones aggregation enabled. */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: if (sbi->s_features & ZONEFS_F_AGGRCNV) { /* One file per set of contiguous conventional zones */ if (!(sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones) || zone->start != zd->cnv_zone_start) sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; zd->cnv_zone_start = zone->start + zone->len; } else { /* One file per zone */ sbi->s_zgroup[ZONEFS_ZTYPE_CNV].g_nr_zones++; } break; case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: sbi->s_zgroup[ZONEFS_ZTYPE_SEQ].g_nr_zones++; break; default: zonefs_err(zd->sb, "Unsupported zone type 0x%x\n", zone->type); return -EIO; } memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone)); return 0; } static int zonefs_get_zone_info(struct zonefs_zone_data *zd) { struct block_device *bdev = zd->sb->s_bdev; int ret; zd->zones = kvcalloc(bdev_nr_zones(bdev), sizeof(struct blk_zone), GFP_KERNEL); if (!zd->zones) return -ENOMEM; /* Get zones information from the device */ ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, zonefs_get_zone_info_cb, zd); if (ret < 0) { zonefs_err(zd->sb, "Zone report failed %d\n", ret); return ret; } if (ret != bdev_nr_zones(bdev)) { zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n", ret, bdev_nr_zones(bdev)); return -EIO; } return 0; } static inline void zonefs_free_zone_info(struct zonefs_zone_data *zd) { kvfree(zd->zones); } /* * Create a zone group and populate it with zone files. */ static int zonefs_init_zgroup(struct super_block *sb, struct zonefs_zone_data *zd, enum zonefs_ztype ztype) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_zone_group *zgroup = &sbi->s_zgroup[ztype]; struct blk_zone *zone, *next, *end; struct zonefs_zone *z; unsigned int n = 0; int ret; /* Allocate the zone group. If it is empty, we have nothing to do. */ if (!zgroup->g_nr_zones) return 0; zgroup->g_zones = kvcalloc(zgroup->g_nr_zones, sizeof(struct zonefs_zone), GFP_KERNEL); if (!zgroup->g_zones) return -ENOMEM; /* * Initialize the zone groups using the device zone information. * We always skip the first zone as it contains the super block * and is not use to back a file. */ end = zd->zones + bdev_nr_zones(sb->s_bdev); for (zone = &zd->zones[1]; zone < end; zone = next) { next = zone + 1; if (zonefs_zone_type(zone) != ztype) continue; if (WARN_ON_ONCE(n >= zgroup->g_nr_zones)) return -EINVAL; /* * For conventional zones, contiguous zones can be aggregated * together to form larger files. Note that this overwrites the * length of the first zone of the set of contiguous zones * aggregated together. If one offline or read-only zone is * found, assume that all zones aggregated have the same * condition. */ if (ztype == ZONEFS_ZTYPE_CNV && (sbi->s_features & ZONEFS_F_AGGRCNV)) { for (; next < end; next++) { if (zonefs_zone_type(next) != ztype) break; zone->len += next->len; zone->capacity += next->capacity; if (next->cond == BLK_ZONE_COND_READONLY && zone->cond != BLK_ZONE_COND_OFFLINE) zone->cond = BLK_ZONE_COND_READONLY; else if (next->cond == BLK_ZONE_COND_OFFLINE) zone->cond = BLK_ZONE_COND_OFFLINE; } } z = &zgroup->g_zones[n]; if (ztype == ZONEFS_ZTYPE_CNV) z->z_flags |= ZONEFS_ZONE_CNV; z->z_sector = zone->start; z->z_size = zone->len << SECTOR_SHIFT; if (z->z_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT && !(sbi->s_features & ZONEFS_F_AGGRCNV)) { zonefs_err(sb, "Invalid zone size %llu (device zone sectors %llu)\n", z->z_size, bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT); return -EINVAL; } z->z_capacity = min_t(loff_t, MAX_LFS_FILESIZE, zone->capacity << SECTOR_SHIFT); z->z_wpoffset = zonefs_check_zone_condition(sb, z, zone); z->z_mode = S_IFREG | sbi->s_perm; z->z_uid = sbi->s_uid; z->z_gid = sbi->s_gid; /* * Let zonefs_inode_update_mode() know that we will need * special initialization of the inode mode the first time * it is accessed. */ z->z_flags |= ZONEFS_ZONE_INIT_MODE; sb->s_maxbytes = max(z->z_capacity, sb->s_maxbytes); sbi->s_blocks += z->z_capacity >> sb->s_blocksize_bits; sbi->s_used_blocks += z->z_wpoffset >> sb->s_blocksize_bits; /* * For sequential zones, make sure that any open zone is closed * first to ensure that the initial number of open zones is 0, * in sync with the open zone accounting done when the mount * option ZONEFS_MNTOPT_EXPLICIT_OPEN is used. */ if (ztype == ZONEFS_ZTYPE_SEQ && (zone->cond == BLK_ZONE_COND_IMP_OPEN || zone->cond == BLK_ZONE_COND_EXP_OPEN)) { ret = zonefs_zone_mgmt(sb, z, REQ_OP_ZONE_CLOSE); if (ret) return ret; } zonefs_account_active(sb, z); n++; } if (WARN_ON_ONCE(n != zgroup->g_nr_zones)) return -EINVAL; zonefs_info(sb, "Zone group \"%s\" has %u file%s\n", zonefs_zgroup_name(ztype), zgroup->g_nr_zones, str_plural(zgroup->g_nr_zones)); return 0; } static void zonefs_free_zgroups(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); enum zonefs_ztype ztype; if (!sbi) return; for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { kvfree(sbi->s_zgroup[ztype].g_zones); sbi->s_zgroup[ztype].g_zones = NULL; } } /* * Create a zone group and populate it with zone files. */ static int zonefs_init_zgroups(struct super_block *sb) { struct zonefs_zone_data zd; enum zonefs_ztype ztype; int ret; /* First get the device zone information */ memset(&zd, 0, sizeof(struct zonefs_zone_data)); zd.sb = sb; ret = zonefs_get_zone_info(&zd); if (ret) goto cleanup; /* Allocate and initialize the zone groups */ for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { ret = zonefs_init_zgroup(sb, &zd, ztype); if (ret) { zonefs_info(sb, "Zone group \"%s\" initialization failed\n", zonefs_zgroup_name(ztype)); break; } } cleanup: zonefs_free_zone_info(&zd); if (ret) zonefs_free_zgroups(sb); return ret; } /* * Read super block information from the device. */ static int zonefs_read_super(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_super *super; u32 crc, stored_crc; struct page *page; struct bio_vec bio_vec; struct bio bio; int ret; page = alloc_page(GFP_KERNEL); if (!page) return -ENOMEM; bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = 0; __bio_add_page(&bio, page, PAGE_SIZE, 0); ret = submit_bio_wait(&bio); if (ret) goto free_page; super = page_address(page); ret = -EINVAL; if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC) goto free_page; stored_crc = le32_to_cpu(super->s_crc); super->s_crc = 0; crc = crc32(~0U, (unsigned char *)super, sizeof(struct zonefs_super)); if (crc != stored_crc) { zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)", crc, stored_crc); goto free_page; } sbi->s_features = le64_to_cpu(super->s_features); if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) { zonefs_err(sb, "Unknown features set 0x%llx\n", sbi->s_features); goto free_page; } if (sbi->s_features & ZONEFS_F_UID) { sbi->s_uid = make_kuid(current_user_ns(), le32_to_cpu(super->s_uid)); if (!uid_valid(sbi->s_uid)) { zonefs_err(sb, "Invalid UID feature\n"); goto free_page; } } if (sbi->s_features & ZONEFS_F_GID) { sbi->s_gid = make_kgid(current_user_ns(), le32_to_cpu(super->s_gid)); if (!gid_valid(sbi->s_gid)) { zonefs_err(sb, "Invalid GID feature\n"); goto free_page; } } if (sbi->s_features & ZONEFS_F_PERM) sbi->s_perm = le32_to_cpu(super->s_perm); if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) { zonefs_err(sb, "Reserved area is being used\n"); goto free_page; } import_uuid(&sbi->s_uuid, super->s_uuid); ret = 0; free_page: __free_page(page); return ret; } static const struct super_operations zonefs_sops = { .alloc_inode = zonefs_alloc_inode, .free_inode = zonefs_free_inode, .statfs = zonefs_statfs, .show_options = zonefs_show_options, }; static int zonefs_get_zgroup_inodes(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct inode *dir_inode; enum zonefs_ztype ztype; for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { if (!sbi->s_zgroup[ztype].g_nr_zones) continue; dir_inode = zonefs_get_zgroup_inode(sb, ztype); if (IS_ERR(dir_inode)) return PTR_ERR(dir_inode); sbi->s_zgroup[ztype].g_inode = dir_inode; } return 0; } static void zonefs_release_zgroup_inodes(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); enum zonefs_ztype ztype; if (!sbi) return; for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { if (sbi->s_zgroup[ztype].g_inode) { iput(sbi->s_zgroup[ztype].g_inode); sbi->s_zgroup[ztype].g_inode = NULL; } } } /* * Check that the device is zoned. If it is, get the list of zones and create * sub-directories and files according to the device zone configuration and * format options. */ static int zonefs_fill_super(struct super_block *sb, struct fs_context *fc) { struct zonefs_sb_info *sbi; struct zonefs_context *ctx = fc->fs_private; struct inode *inode; enum zonefs_ztype ztype; int ret; if (!bdev_is_zoned(sb->s_bdev)) { zonefs_err(sb, "Not a zoned block device\n"); return -EINVAL; } /* * Initialize super block information: the maximum file size is updated * when the zone files are created so that the format option * ZONEFS_F_AGGRCNV which increases the maximum file size of a file * beyond the zone size is taken into account. */ sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; spin_lock_init(&sbi->s_lock); sb->s_fs_info = sbi; sb->s_magic = ZONEFS_MAGIC; sb->s_maxbytes = 0; sb->s_op = &zonefs_sops; sb->s_time_gran = 1; /* * The block size is set to the device zone write granularity to ensure * that write operations are always aligned according to the device * interface constraints. */ sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev)); sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev)); sbi->s_uid = GLOBAL_ROOT_UID; sbi->s_gid = GLOBAL_ROOT_GID; sbi->s_perm = 0640; sbi->s_mount_opts = ctx->s_mount_opts; atomic_set(&sbi->s_wro_seq_files, 0); sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev); atomic_set(&sbi->s_active_seq_files, 0); sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev); ret = zonefs_read_super(sb); if (ret) return ret; zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev)); if (!sbi->s_max_wro_seq_files && !sbi->s_max_active_seq_files && sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { zonefs_info(sb, "No open and active zone limits. Ignoring explicit_open mount option\n"); sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; } /* Initialize the zone groups */ ret = zonefs_init_zgroups(sb); if (ret) goto cleanup; /* Create the root directory inode */ ret = -ENOMEM; inode = new_inode(sb); if (!inode) goto cleanup; inode->i_ino = bdev_nr_zones(sb->s_bdev); inode->i_mode = S_IFDIR | 0555; simple_inode_init_ts(inode); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &zonefs_dir_operations; inode->i_size = 2; set_nlink(inode, 2); for (ztype = 0; ztype < ZONEFS_ZTYPE_MAX; ztype++) { if (sbi->s_zgroup[ztype].g_nr_zones) { inc_nlink(inode); inode->i_size++; } } sb->s_root = d_make_root(inode); if (!sb->s_root) goto cleanup; /* * Take a reference on the zone groups directory inodes * to keep them in the inode cache. */ ret = zonefs_get_zgroup_inodes(sb); if (ret) goto cleanup; ret = zonefs_sysfs_register(sb); if (ret) goto cleanup; return 0; cleanup: zonefs_release_zgroup_inodes(sb); zonefs_free_zgroups(sb); return ret; } static void zonefs_kill_super(struct super_block *sb) { struct zonefs_sb_info *sbi = ZONEFS_SB(sb); /* Release the reference on the zone group directory inodes */ zonefs_release_zgroup_inodes(sb); kill_block_super(sb); zonefs_sysfs_unregister(sb); zonefs_free_zgroups(sb); kfree(sbi); } static void zonefs_free_fc(struct fs_context *fc) { struct zonefs_context *ctx = fc->fs_private; kfree(ctx); } static int zonefs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, zonefs_fill_super); } static int zonefs_reconfigure(struct fs_context *fc) { struct zonefs_context *ctx = fc->fs_private; struct super_block *sb = fc->root->d_sb; struct zonefs_sb_info *sbi = sb->s_fs_info; sync_filesystem(fc->root->d_sb); /* Copy new options from ctx into sbi. */ sbi->s_mount_opts = ctx->s_mount_opts; return 0; } static const struct fs_context_operations zonefs_context_ops = { .parse_param = zonefs_parse_param, .get_tree = zonefs_get_tree, .reconfigure = zonefs_reconfigure, .free = zonefs_free_fc, }; /* * Set up the filesystem mount context. */ static int zonefs_init_fs_context(struct fs_context *fc) { struct zonefs_context *ctx; ctx = kzalloc(sizeof(struct zonefs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; fc->ops = &zonefs_context_ops; fc->fs_private = ctx; return 0; } /* * File system definition and registration. */ static struct file_system_type zonefs_type = { .owner = THIS_MODULE, .name = "zonefs", .kill_sb = zonefs_kill_super, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = zonefs_init_fs_context, .parameters = zonefs_param_spec, }; static int __init zonefs_init_inodecache(void) { zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache", sizeof(struct zonefs_inode_info), 0, SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, NULL); if (zonefs_inode_cachep == NULL) return -ENOMEM; return 0; } static void zonefs_destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy the inode cache. */ rcu_barrier(); kmem_cache_destroy(zonefs_inode_cachep); } static int __init zonefs_init(void) { int ret; BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE); ret = zonefs_init_inodecache(); if (ret) return ret; ret = zonefs_sysfs_init(); if (ret) goto destroy_inodecache; ret = register_filesystem(&zonefs_type); if (ret) goto sysfs_exit; return 0; sysfs_exit: zonefs_sysfs_exit(); destroy_inodecache: zonefs_destroy_inodecache(); return ret; } static void __exit zonefs_exit(void) { unregister_filesystem(&zonefs_type); zonefs_sysfs_exit(); zonefs_destroy_inodecache(); } MODULE_AUTHOR("Damien Le Moal"); MODULE_DESCRIPTION("Zone file system for zoned block devices"); MODULE_LICENSE("GPL"); MODULE_ALIAS_FS("zonefs"); module_init(zonefs_init); module_exit(zonefs_exit);
6 6 2 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (mr->rangesize != 1) { pr_info_ratelimited("multiple ranges no longer supported\n"); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); } static int xt_nat_checkentry(const struct xt_tgchk_param *par) { return nf_ct_netns_get(par->net, par->family); } static void xt_nat_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static void xt_nat_convert_range(struct nf_nat_range2 *dst, const struct nf_nat_ipv4_range *src) { memset(&dst->min_addr, 0, sizeof(dst->min_addr)); memset(&dst->max_addr, 0, sizeof(dst->max_addr)); memset(&dst->base_proto, 0, sizeof(dst->base_proto)); dst->flags = src->flags; dst->min_addr.ip = src->min_ip; dst->max_addr.ip = src->max_ip; dst->min_proto = src->min; dst->max_proto = src->max; } static unsigned int xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); xt_nat_convert_range(&range, &mr->range[0]); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v0(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); xt_nat_convert_range(&range, &mr->range[0]); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); } static unsigned int xt_snat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range *range_v1 = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); memcpy(&range, range_v1, sizeof(*range_v1)); memset(&range.base_proto, 0, sizeof(range.base_proto)); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v1(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range *range_v1 = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); memcpy(&range, range_v1, sizeof(*range_v1)); memset(&range.base_proto, 0, sizeof(range.base_proto)); return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); } static unsigned int xt_snat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); return nf_nat_setup_info(ct, range, NF_NAT_MANIP_SRC); } static unsigned int xt_dnat_target_v2(struct sk_buff *skb, const struct xt_action_param *par) { const struct nf_nat_range2 *range = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); return nf_nat_setup_info(ct, range, NF_NAT_MANIP_DST); } static struct xt_target xt_nat_target_reg[] __read_mostly = { { .name = "SNAT", .revision = 0, .checkentry = xt_nat_checkentry_v0, .destroy = xt_nat_destroy, .target = xt_snat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family = NFPROTO_IPV4, .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "DNAT", .revision = 0, .checkentry = xt_nat_checkentry_v0, .destroy = xt_nat_destroy, .target = xt_dnat_target_v0, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .family = NFPROTO_IPV4, .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, { .name = "SNAT", .revision = 1, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_snat_target_v1, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "DNAT", .revision = 1, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_dnat_target_v1, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, { .name = "SNAT", .revision = 2, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_snat_target_v2, .targetsize = sizeof(struct nf_nat_range2), .table = "nat", .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, { .name = "DNAT", .revision = 2, .checkentry = xt_nat_checkentry, .destroy = xt_nat_destroy, .target = xt_dnat_target_v2, .targetsize = sizeof(struct nf_nat_range2), .table = "nat", .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT), .me = THIS_MODULE, }, }; static int __init xt_nat_init(void) { return xt_register_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg)); } static void __exit xt_nat_exit(void) { xt_unregister_targets(xt_nat_target_reg, ARRAY_SIZE(xt_nat_target_reg)); } module_init(xt_nat_init); module_exit(xt_nat_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS("ipt_SNAT"); MODULE_ALIAS("ipt_DNAT"); MODULE_ALIAS("ip6t_SNAT"); MODULE_ALIAS("ip6t_DNAT"); MODULE_DESCRIPTION("SNAT and DNAT targets support");
6 6 4 6 6 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 // SPDX-License-Identifier: GPL-2.0-only /* * Line 6 Linux USB driver * * Copyright (C) 2004-2010 Markus Grabner (line6@grabner-graz.at) */ #include <linux/slab.h> #include <linux/wait.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/usb.h> #include <sound/core.h> #include <sound/control.h> #include "capture.h" #include "driver.h" #include "playback.h" /* Locate name in binary program dump */ #define POD_NAME_OFFSET 0 #define POD_NAME_LENGTH 16 /* Other constants */ #define POD_CONTROL_SIZE 0x80 #define POD_BUFSIZE_DUMPREQ 7 #define POD_STARTUP_DELAY 1000 /* Stages of POD startup procedure */ enum { POD_STARTUP_VERSIONREQ, POD_STARTUP_SETUP, POD_STARTUP_DONE, }; enum { LINE6_BASSPODXT, LINE6_BASSPODXTLIVE, LINE6_BASSPODXTPRO, LINE6_POCKETPOD, LINE6_PODXT, LINE6_PODXTLIVE_POD, LINE6_PODXTPRO, }; struct usb_line6_pod { /* Generic Line 6 USB data */ struct usb_line6 line6; /* Instrument monitor level */ int monitor_level; /* Current progress in startup procedure */ int startup_progress; /* Serial number of device */ u32 serial_number; /* Firmware version (x 100) */ int firmware_version; /* Device ID */ int device_id; }; #define line6_to_pod(x) container_of(x, struct usb_line6_pod, line6) #define POD_SYSEX_CODE 3 /* *INDENT-OFF* */ enum { POD_SYSEX_SAVE = 0x24, POD_SYSEX_SYSTEM = 0x56, POD_SYSEX_SYSTEMREQ = 0x57, /* POD_SYSEX_UPDATE = 0x6c, */ /* software update! */ POD_SYSEX_STORE = 0x71, POD_SYSEX_FINISH = 0x72, POD_SYSEX_DUMPMEM = 0x73, POD_SYSEX_DUMP = 0x74, POD_SYSEX_DUMPREQ = 0x75 /* dumps entire internal memory of PODxt Pro */ /* POD_SYSEX_DUMPMEM2 = 0x76 */ }; enum { POD_MONITOR_LEVEL = 0x04, POD_SYSTEM_INVALID = 0x10000 }; /* *INDENT-ON* */ enum { POD_DUMP_MEMORY = 2 }; enum { POD_BUSY_READ, POD_BUSY_WRITE, POD_CHANNEL_DIRTY, POD_SAVE_PRESSED, POD_BUSY_MIDISEND }; static const struct snd_ratden pod_ratden = { .num_min = 78125, .num_max = 78125, .num_step = 1, .den = 2 }; static struct line6_pcm_properties pod_pcm_properties = { .playback_hw = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_BLOCK_TRANSFER | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_PAUSE | SNDRV_PCM_INFO_SYNC_START), .formats = SNDRV_PCM_FMTBIT_S24_3LE, .rates = SNDRV_PCM_RATE_KNOT, .rate_min = 39062, .rate_max = 39063, .channels_min = 2, .channels_max = 2, .buffer_bytes_max = 60000, .period_bytes_min = 64, .period_bytes_max = 8192, .periods_min = 1, .periods_max = 1024}, .capture_hw = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_BLOCK_TRANSFER | SNDRV_PCM_INFO_MMAP_VALID | SNDRV_PCM_INFO_SYNC_START), .formats = SNDRV_PCM_FMTBIT_S24_3LE, .rates = SNDRV_PCM_RATE_KNOT, .rate_min = 39062, .rate_max = 39063, .channels_min = 2, .channels_max = 2, .buffer_bytes_max = 60000, .period_bytes_min = 64, .period_bytes_max = 8192, .periods_min = 1, .periods_max = 1024}, .rates = { .nrats = 1, .rats = &pod_ratden}, .bytes_per_channel = 3 /* SNDRV_PCM_FMTBIT_S24_3LE */ }; static const char pod_version_header[] = { 0xf0, 0x7e, 0x7f, 0x06, 0x02 }; static char *pod_alloc_sysex_buffer(struct usb_line6_pod *pod, int code, int size) { return line6_alloc_sysex_buffer(&pod->line6, POD_SYSEX_CODE, code, size); } /* Process a completely received message. */ static void line6_pod_process_message(struct usb_line6 *line6) { struct usb_line6_pod *pod = line6_to_pod(line6); const unsigned char *buf = pod->line6.buffer_message; if (memcmp(buf, pod_version_header, sizeof(pod_version_header)) == 0) { pod->firmware_version = buf[13] * 100 + buf[14] * 10 + buf[15]; pod->device_id = ((int)buf[8] << 16) | ((int)buf[9] << 8) | (int) buf[10]; if (pod->startup_progress == POD_STARTUP_VERSIONREQ) { pod->startup_progress = POD_STARTUP_SETUP; schedule_delayed_work(&line6->startup_work, 0); } return; } /* Only look for sysex messages from this device */ if (buf[0] != (LINE6_SYSEX_BEGIN | LINE6_CHANNEL_DEVICE) && buf[0] != (LINE6_SYSEX_BEGIN | LINE6_CHANNEL_UNKNOWN)) { return; } if (memcmp(buf + 1, line6_midi_id, sizeof(line6_midi_id)) != 0) return; if (buf[5] == POD_SYSEX_SYSTEM && buf[6] == POD_MONITOR_LEVEL) { short value = ((int)buf[7] << 12) | ((int)buf[8] << 8) | ((int)buf[9] << 4) | (int)buf[10]; pod->monitor_level = value; } } /* Send system parameter (from integer). */ static int pod_set_system_param_int(struct usb_line6_pod *pod, int value, int code) { char *sysex; static const int size = 5; sysex = pod_alloc_sysex_buffer(pod, POD_SYSEX_SYSTEM, size); if (!sysex) return -ENOMEM; sysex[SYSEX_DATA_OFS] = code; sysex[SYSEX_DATA_OFS + 1] = (value >> 12) & 0x0f; sysex[SYSEX_DATA_OFS + 2] = (value >> 8) & 0x0f; sysex[SYSEX_DATA_OFS + 3] = (value >> 4) & 0x0f; sysex[SYSEX_DATA_OFS + 4] = (value) & 0x0f; line6_send_sysex_message(&pod->line6, sysex, size); kfree(sysex); return 0; } /* "read" request on "serial_number" special file. */ static ssize_t serial_number_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = dev_to_snd_card(dev); struct usb_line6_pod *pod = card->private_data; return sysfs_emit(buf, "%u\n", pod->serial_number); } /* "read" request on "firmware_version" special file. */ static ssize_t firmware_version_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = dev_to_snd_card(dev); struct usb_line6_pod *pod = card->private_data; return sysfs_emit(buf, "%d.%02d\n", pod->firmware_version / 100, pod->firmware_version % 100); } /* "read" request on "device_id" special file. */ static ssize_t device_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct snd_card *card = dev_to_snd_card(dev); struct usb_line6_pod *pod = card->private_data; return sysfs_emit(buf, "%d\n", pod->device_id); } /* POD startup procedure. This is a sequence of functions with special requirements (e.g., must not run immediately after initialization, must not run in interrupt context). After the last one has finished, the device is ready to use. */ static void pod_startup(struct usb_line6 *line6) { struct usb_line6_pod *pod = line6_to_pod(line6); switch (pod->startup_progress) { case POD_STARTUP_VERSIONREQ: /* request firmware version: */ line6_version_request_async(line6); break; case POD_STARTUP_SETUP: /* serial number: */ line6_read_serial_number(&pod->line6, &pod->serial_number); /* ALSA audio interface: */ if (snd_card_register(line6->card)) dev_err(line6->ifcdev, "Failed to register POD card.\n"); pod->startup_progress = POD_STARTUP_DONE; break; default: break; } } /* POD special files: */ static DEVICE_ATTR_RO(device_id); static DEVICE_ATTR_RO(firmware_version); static DEVICE_ATTR_RO(serial_number); static struct attribute *pod_dev_attrs[] = { &dev_attr_device_id.attr, &dev_attr_firmware_version.attr, &dev_attr_serial_number.attr, NULL }; static const struct attribute_group pod_dev_attr_group = { .name = "pod", .attrs = pod_dev_attrs, }; /* control info callback */ static int snd_pod_control_monitor_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 1; uinfo->value.integer.min = 0; uinfo->value.integer.max = 65535; return 0; } /* control get callback */ static int snd_pod_control_monitor_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_line6_pcm *line6pcm = snd_kcontrol_chip(kcontrol); struct usb_line6_pod *pod = line6_to_pod(line6pcm->line6); ucontrol->value.integer.value[0] = pod->monitor_level; return 0; } /* control put callback */ static int snd_pod_control_monitor_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_line6_pcm *line6pcm = snd_kcontrol_chip(kcontrol); struct usb_line6_pod *pod = line6_to_pod(line6pcm->line6); if (ucontrol->value.integer.value[0] == pod->monitor_level) return 0; pod->monitor_level = ucontrol->value.integer.value[0]; pod_set_system_param_int(pod, ucontrol->value.integer.value[0], POD_MONITOR_LEVEL); return 1; } /* control definition */ static const struct snd_kcontrol_new pod_control_monitor = { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "Monitor Playback Volume", .index = 0, .access = SNDRV_CTL_ELEM_ACCESS_READWRITE, .info = snd_pod_control_monitor_info, .get = snd_pod_control_monitor_get, .put = snd_pod_control_monitor_put }; /* Try to init POD device. */ static int pod_init(struct usb_line6 *line6, const struct usb_device_id *id) { int err; struct usb_line6_pod *pod = line6_to_pod(line6); line6->process_message = line6_pod_process_message; line6->startup = pod_startup; /* create sysfs entries: */ err = snd_card_add_dev_attr(line6->card, &pod_dev_attr_group); if (err < 0) return err; /* initialize PCM subsystem: */ err = line6_init_pcm(line6, &pod_pcm_properties); if (err < 0) return err; /* register monitor control: */ err = snd_ctl_add(line6->card, snd_ctl_new1(&pod_control_monitor, line6->line6pcm)); if (err < 0) return err; /* When the sound card is registered at this point, the PODxt Live displays "Invalid Code Error 07", so we do it later in the event handler. */ if (pod->line6.properties->capabilities & LINE6_CAP_CONTROL) { pod->monitor_level = POD_SYSTEM_INVALID; /* initiate startup procedure: */ schedule_delayed_work(&line6->startup_work, msecs_to_jiffies(POD_STARTUP_DELAY)); } return 0; } #define LINE6_DEVICE(prod) USB_DEVICE(0x0e41, prod) #define LINE6_IF_NUM(prod, n) USB_DEVICE_INTERFACE_NUMBER(0x0e41, prod, n) /* table of devices that work with this driver */ static const struct usb_device_id pod_id_table[] = { { LINE6_DEVICE(0x4250), .driver_info = LINE6_BASSPODXT }, { LINE6_DEVICE(0x4642), .driver_info = LINE6_BASSPODXTLIVE }, { LINE6_DEVICE(0x4252), .driver_info = LINE6_BASSPODXTPRO }, { LINE6_IF_NUM(0x5051, 1), .driver_info = LINE6_POCKETPOD }, { LINE6_DEVICE(0x5044), .driver_info = LINE6_PODXT }, { LINE6_IF_NUM(0x4650, 0), .driver_info = LINE6_PODXTLIVE_POD }, { LINE6_DEVICE(0x5050), .driver_info = LINE6_PODXTPRO }, {} }; MODULE_DEVICE_TABLE(usb, pod_id_table); static const struct line6_properties pod_properties_table[] = { [LINE6_BASSPODXT] = { .id = "BassPODxt", .name = "BassPODxt", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_BASSPODXTLIVE] = { .id = "BassPODxtLive", .name = "BassPODxt Live", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 1, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_BASSPODXTPRO] = { .id = "BassPODxtPro", .name = "BassPODxt Pro", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_POCKETPOD] = { .id = "PocketPOD", .name = "Pocket POD", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI, .altsetting = 0, .ep_ctrl_r = 0x82, .ep_ctrl_w = 0x02, /* no audio channel */ }, [LINE6_PODXT] = { .id = "PODxt", .name = "PODxt", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_PODXTLIVE_POD] = { .id = "PODxtLive", .name = "PODxt Live", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 1, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, [LINE6_PODXTPRO] = { .id = "PODxtPro", .name = "PODxt Pro", .capabilities = LINE6_CAP_CONTROL | LINE6_CAP_CONTROL_MIDI | LINE6_CAP_PCM | LINE6_CAP_HWMON, .altsetting = 5, .ep_ctrl_r = 0x84, .ep_ctrl_w = 0x03, .ep_audio_r = 0x82, .ep_audio_w = 0x01, }, }; /* Probe USB device. */ static int pod_probe(struct usb_interface *interface, const struct usb_device_id *id) { return line6_probe(interface, id, "Line6-POD", &pod_properties_table[id->driver_info], pod_init, sizeof(struct usb_line6_pod)); } static struct usb_driver pod_driver = { .name = KBUILD_MODNAME, .probe = pod_probe, .disconnect = line6_disconnect, #ifdef CONFIG_PM .suspend = line6_suspend, .resume = line6_resume, .reset_resume = line6_resume, #endif .id_table = pod_id_table, }; module_usb_driver(pod_driver); MODULE_DESCRIPTION("Line 6 POD USB driver"); MODULE_LICENSE("GPL");
43 36 42 83 24 11 4 1 3 2 1 4 4 4 4 4 4 1 4 3 1 1 2 2 4 5 5 2 5 3 1 1 2 2 29 1 4 3 1 7 3 5 4 3 1 1 5 3 1 2 2 2 4 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 // SPDX-License-Identifier: GPL-2.0-only /* * kernel/power/main.c - PM subsystem core functionality. * * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab */ #include <linux/acpi.h> #include <linux/export.h> #include <linux/kobject.h> #include <linux/string.h> #include <linux/pm-trace.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/pm_runtime.h> #include "power.h" #ifdef CONFIG_PM_SLEEP /* * The following functions are used by the suspend/hibernate code to temporarily * change gfp_allowed_mask in order to avoid using I/O during memory allocations * while devices are suspended. To avoid races with the suspend/hibernate code, * they should always be called with system_transition_mutex held * (gfp_allowed_mask also should only be modified with system_transition_mutex * held, unless the suspend/hibernate code is guaranteed not to run in parallel * with that modification). */ static gfp_t saved_gfp_mask; void pm_restore_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); if (saved_gfp_mask) { gfp_allowed_mask = saved_gfp_mask; saved_gfp_mask = 0; } } void pm_restrict_gfp_mask(void) { WARN_ON(!mutex_is_locked(&system_transition_mutex)); WARN_ON(saved_gfp_mask); saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); } unsigned int lock_system_sleep(void) { unsigned int flags = current->flags; current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); return flags; } EXPORT_SYMBOL_GPL(lock_system_sleep); void unlock_system_sleep(unsigned int flags) { if (!(flags & PF_NOFREEZE)) current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); void ksys_sync_helper(void) { ktime_t start; long elapsed_msecs; start = ktime_get(); ksys_sync(); elapsed_msecs = ktime_to_ms(ktime_sub(ktime_get(), start)); pr_info("Filesystems sync: %ld.%03ld seconds\n", elapsed_msecs / MSEC_PER_SEC, elapsed_msecs % MSEC_PER_SEC); } EXPORT_SYMBOL_GPL(ksys_sync_helper); /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); int register_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(register_pm_notifier); int unregister_pm_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&pm_chain_head, nb); } EXPORT_SYMBOL_GPL(unregister_pm_notifier); int pm_notifier_call_chain_robust(unsigned long val_up, unsigned long val_down) { int ret; ret = blocking_notifier_call_chain_robust(&pm_chain_head, val_up, val_down, NULL); return notifier_to_errno(ret); } int pm_notifier_call_chain(unsigned long val) { return blocking_notifier_call_chain(&pm_chain_head, val, NULL); } /* If set, devices may be suspended and resumed asynchronously. */ int pm_async_enabled = 1; static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_async_enabled); } static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_async_enabled = val; return n; } power_attr(pm_async); #ifdef CONFIG_SUSPEND static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) { if (i >= PM_SUSPEND_MEM && cxl_mem_active()) continue; if (mem_sleep_states[i]) { const char *label = mem_sleep_states[i]; if (mem_sleep_current == i) count += sysfs_emit_at(buf, count, "[%s] ", label); else count += sysfs_emit_at(buf, count, "%s ", label); } } /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static suspend_state_t decode_suspend_state(const char *buf, size_t n) { suspend_state_t state; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = mem_sleep_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } return PM_SUSPEND_ON; } static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_suspend_state(buf, n); if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) mem_sleep_current = state; else error = -EINVAL; out: pm_autosleep_unlock(); return error ? error : n; } power_attr(mem_sleep); /* * sync_on_suspend: invoke ksys_sync_helper() before suspend. * * show() returns whether ksys_sync_helper() is invoked before suspend. * store() accepts 0 or 1. 0 disables ksys_sync_helper() and 1 enables it. */ bool sync_on_suspend_enabled = !IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC); static ssize_t sync_on_suspend_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", sync_on_suspend_enabled); } static ssize_t sync_on_suspend_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; sync_on_suspend_enabled = !!val; return n; } power_attr(sync_on_suspend); #endif /* CONFIG_SUSPEND */ #ifdef CONFIG_PM_SLEEP_DEBUG int pm_test_level = TEST_NONE; static const char * const pm_tests[__TEST_AFTER_LAST] = { [TEST_NONE] = "none", [TEST_CORE] = "core", [TEST_CPUS] = "processors", [TEST_PLATFORM] = "platform", [TEST_DEVICES] = "devices", [TEST_FREEZER] = "freezer", }; static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; int level; for (level = TEST_FIRST; level <= TEST_MAX; level++) if (pm_tests[level]) { if (level == pm_test_level) count += sysfs_emit_at(buf, count, "[%s] ", pm_tests[level]); else count += sysfs_emit_at(buf, count, "%s ", pm_tests[level]); } /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int sleep_flags; const char * const *s; int error = -EINVAL; int level; char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { pm_test_level = level; error = 0; break; } unlock_system_sleep(sleep_flags); return error ? error : n; } power_attr(pm_test); #endif /* CONFIG_PM_SLEEP_DEBUG */ #define SUSPEND_NR_STEPS SUSPEND_RESUME #define REC_FAILED_NUM 2 struct suspend_stats { unsigned int step_failures[SUSPEND_NR_STEPS]; unsigned int success; unsigned int fail; int last_failed_dev; char failed_devs[REC_FAILED_NUM][40]; int last_failed_errno; int errno[REC_FAILED_NUM]; int last_failed_step; u64 last_hw_sleep; u64 total_hw_sleep; u64 max_hw_sleep; enum suspend_stat_step failed_steps[REC_FAILED_NUM]; }; static struct suspend_stats suspend_stats; static DEFINE_MUTEX(suspend_stats_lock); void dpm_save_failed_dev(const char *name) { mutex_lock(&suspend_stats_lock); strscpy(suspend_stats.failed_devs[suspend_stats.last_failed_dev], name, sizeof(suspend_stats.failed_devs[0])); suspend_stats.last_failed_dev++; suspend_stats.last_failed_dev %= REC_FAILED_NUM; mutex_unlock(&suspend_stats_lock); } void dpm_save_failed_step(enum suspend_stat_step step) { suspend_stats.step_failures[step-1]++; suspend_stats.failed_steps[suspend_stats.last_failed_step] = step; suspend_stats.last_failed_step++; suspend_stats.last_failed_step %= REC_FAILED_NUM; } void dpm_save_errno(int err) { if (!err) { suspend_stats.success++; return; } suspend_stats.fail++; suspend_stats.errno[suspend_stats.last_failed_errno] = err; suspend_stats.last_failed_errno++; suspend_stats.last_failed_errno %= REC_FAILED_NUM; } void pm_report_hw_sleep_time(u64 t) { suspend_stats.last_hw_sleep = t; suspend_stats.total_hw_sleep += t; } EXPORT_SYMBOL_GPL(pm_report_hw_sleep_time); void pm_report_max_hw_sleep(u64 t) { suspend_stats.max_hw_sleep = t; } EXPORT_SYMBOL_GPL(pm_report_max_hw_sleep); static const char * const suspend_step_names[] = { [SUSPEND_WORKING] = "", [SUSPEND_FREEZE] = "freeze", [SUSPEND_PREPARE] = "prepare", [SUSPEND_SUSPEND] = "suspend", [SUSPEND_SUSPEND_LATE] = "suspend_late", [SUSPEND_SUSPEND_NOIRQ] = "suspend_noirq", [SUSPEND_RESUME_NOIRQ] = "resume_noirq", [SUSPEND_RESUME_EARLY] = "resume_early", [SUSPEND_RESUME] = "resume", }; #define suspend_attr(_name, format_str) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ return sysfs_emit(buf, format_str, suspend_stats._name);\ } \ static struct kobj_attribute _name = __ATTR_RO(_name) suspend_attr(success, "%u\n"); suspend_attr(fail, "%u\n"); suspend_attr(last_hw_sleep, "%llu\n"); suspend_attr(total_hw_sleep, "%llu\n"); suspend_attr(max_hw_sleep, "%llu\n"); #define suspend_step_attr(_name, step) \ static ssize_t _name##_show(struct kobject *kobj, \ struct kobj_attribute *attr, char *buf) \ { \ return sysfs_emit(buf, "%u\n", \ suspend_stats.step_failures[step-1]); \ } \ static struct kobj_attribute _name = __ATTR_RO(_name) suspend_step_attr(failed_freeze, SUSPEND_FREEZE); suspend_step_attr(failed_prepare, SUSPEND_PREPARE); suspend_step_attr(failed_suspend, SUSPEND_SUSPEND); suspend_step_attr(failed_suspend_late, SUSPEND_SUSPEND_LATE); suspend_step_attr(failed_suspend_noirq, SUSPEND_SUSPEND_NOIRQ); suspend_step_attr(failed_resume, SUSPEND_RESUME); suspend_step_attr(failed_resume_early, SUSPEND_RESUME_EARLY); suspend_step_attr(failed_resume_noirq, SUSPEND_RESUME_NOIRQ); static ssize_t last_failed_dev_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; char *last_failed_dev = NULL; index = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_dev = suspend_stats.failed_devs[index]; return sysfs_emit(buf, "%s\n", last_failed_dev); } static struct kobj_attribute last_failed_dev = __ATTR_RO(last_failed_dev); static ssize_t last_failed_errno_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { int index; int last_failed_errno; index = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; last_failed_errno = suspend_stats.errno[index]; return sysfs_emit(buf, "%d\n", last_failed_errno); } static struct kobj_attribute last_failed_errno = __ATTR_RO(last_failed_errno); static ssize_t last_failed_step_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { enum suspend_stat_step step; int index; index = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; index %= REC_FAILED_NUM; step = suspend_stats.failed_steps[index]; return sysfs_emit(buf, "%s\n", suspend_step_names[step]); } static struct kobj_attribute last_failed_step = __ATTR_RO(last_failed_step); static struct attribute *suspend_attrs[] = { &success.attr, &fail.attr, &failed_freeze.attr, &failed_prepare.attr, &failed_suspend.attr, &failed_suspend_late.attr, &failed_suspend_noirq.attr, &failed_resume.attr, &failed_resume_early.attr, &failed_resume_noirq.attr, &last_failed_dev.attr, &last_failed_errno.attr, &last_failed_step.attr, &last_hw_sleep.attr, &total_hw_sleep.attr, &max_hw_sleep.attr, NULL, }; static umode_t suspend_attr_is_visible(struct kobject *kobj, struct attribute *attr, int idx) { if (attr != &last_hw_sleep.attr && attr != &total_hw_sleep.attr && attr != &max_hw_sleep.attr) return 0444; #ifdef CONFIG_ACPI if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) return 0444; #endif return 0; } static const struct attribute_group suspend_attr_group = { .name = "suspend_stats", .attrs = suspend_attrs, .is_visible = suspend_attr_is_visible, }; #ifdef CONFIG_DEBUG_FS static int suspend_stats_show(struct seq_file *s, void *unused) { int i, index, last_dev, last_errno, last_step; enum suspend_stat_step step; last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; last_dev %= REC_FAILED_NUM; last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; seq_printf(s, "success: %u\nfail: %u\n", suspend_stats.success, suspend_stats.fail); for (step = SUSPEND_FREEZE; step <= SUSPEND_NR_STEPS; step++) seq_printf(s, "failed_%s: %u\n", suspend_step_names[step], suspend_stats.step_failures[step-1]); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", suspend_stats.failed_devs[last_dev]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_dev + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_stats.failed_devs[index]); } seq_printf(s, " last_failed_errno:\t%-d\n", suspend_stats.errno[last_errno]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_errno + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-d\n", suspend_stats.errno[index]); } seq_printf(s, " last_failed_step:\t%-s\n", suspend_step_names[suspend_stats.failed_steps[last_step]]); for (i = 1; i < REC_FAILED_NUM; i++) { index = last_step + REC_FAILED_NUM - i; index %= REC_FAILED_NUM; seq_printf(s, "\t\t\t%-s\n", suspend_step_names[suspend_stats.failed_steps[index]]); } return 0; } DEFINE_SHOW_ATTRIBUTE(suspend_stats); static int __init pm_debugfs_init(void) { debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, NULL, NULL, &suspend_stats_fops); return 0; } late_initcall(pm_debugfs_init); #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_SLEEP_DEBUG /* * pm_print_times: print time taken by devices to suspend and resume. * * show() returns whether printing of suspend and resume times is enabled. * store() accepts 0 or 1. 0 disables printing and 1 enables it. */ bool pm_print_times_enabled; static ssize_t pm_print_times_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_print_times_enabled); } static ssize_t pm_print_times_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_print_times_enabled = !!val; return n; } power_attr(pm_print_times); static inline void pm_print_times_init(void) { pm_print_times_enabled = !!initcall_debug; } static ssize_t pm_wakeup_irq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { if (!pm_wakeup_irq()) return -ENODATA; return sysfs_emit(buf, "%u\n", pm_wakeup_irq()); } power_attr_ro(pm_wakeup_irq); bool pm_debug_messages_on __read_mostly; bool pm_debug_messages_should_print(void) { return pm_debug_messages_on && pm_suspend_target_state != PM_SUSPEND_ON; } EXPORT_SYMBOL_GPL(pm_debug_messages_should_print); static ssize_t pm_debug_messages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_debug_messages_on); } static ssize_t pm_debug_messages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; if (val > 1) return -EINVAL; pm_debug_messages_on = !!val; return n; } power_attr(pm_debug_messages); static int __init pm_debug_messages_setup(char *str) { pm_debug_messages_on = true; return 1; } __setup("pm_debug_messages", pm_debug_messages_setup); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} #endif /* CONFIG_PM_SLEEP_DEBUG */ struct kobject *power_kobj; /* * state - control system sleep states. * * show() returns available sleep state labels, which may be "mem", "standby", * "freeze" and "disk" (hibernation). * See Documentation/admin-guide/pm/sleep-states.rst for a description of * what they mean. * * store() accepts one of those strings, translates it into the proper * enumerated value, and initiates a suspend transition. */ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { ssize_t count = 0; #ifdef CONFIG_SUSPEND suspend_state_t i; for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) if (pm_states[i]) count += sysfs_emit_at(buf, count, "%s ", pm_states[i]); #endif if (hibernation_available()) count += sysfs_emit_at(buf, count, "disk "); /* Convert the last space to a newline if needed. */ if (count > 0) buf[count - 1] = '\n'; return count; } static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state; #endif char *p; int len; p = memchr(buf, '\n', n); len = p ? p - buf : n; /* Check hibernation first. */ if (len == 4 && str_has_prefix(buf, "disk")) return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { const char *label = pm_states[state]; if (label && len == strlen(label) && !strncmp(buf, label, len)) return state; } #endif return PM_SUSPEND_ON; } static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } state = decode_state(buf, n); if (state < PM_SUSPEND_MAX) { if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_suspend(state); } else if (state == PM_SUSPEND_MAX) { error = hibernate(); } else { error = -EINVAL; } out: pm_autosleep_unlock(); return error ? error : n; } power_attr(state); #ifdef CONFIG_PM_SLEEP /* * The 'wakeup_count' attribute, along with the functions defined in * drivers/base/power/wakeup.c, provides a means by which wakeup events can be * handled in a non-racy way. * * If a wakeup event occurs when the system is in a sleep state, it simply is * woken up. In turn, if an event that would wake the system up from a sleep * state occurs when it is undergoing a transition to that sleep state, the * transition should be aborted. Moreover, if such an event occurs when the * system is in the working state, an attempt to start a transition to the * given sleep state should fail during certain period after the detection of * the event. Using the 'state' attribute alone is not sufficient to satisfy * these requirements, because a wakeup event may occur exactly when 'state' * is being written to and may be delivered to user space right before it is * frozen, so the event will remain only partially processed until the system is * woken up by another event. In particular, it won't cause the transition to * a sleep state to be aborted. * * This difficulty may be overcome if user space uses 'wakeup_count' before * writing to 'state'. It first should read from 'wakeup_count' and store * the read value. Then, after carrying out its own preparations for the system * transition to a sleep state, it should write the stored value to * 'wakeup_count'. If that fails, at least one wakeup event has occurred since * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it * is allowed to write to 'state', but the transition will be aborted if there * are any wakeup events detected after 'wakeup_count' was written to. */ static ssize_t wakeup_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { unsigned int val; return pm_get_wakeup_count(&val, true) ? sysfs_emit(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned int val; int error; error = pm_autosleep_lock(); if (error) return error; if (pm_autosleep_state() > PM_SUSPEND_ON) { error = -EBUSY; goto out; } error = -EINVAL; if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) error = n; else pm_print_active_wakeup_sources(); } out: pm_autosleep_unlock(); return error; } power_attr(wakeup_count); #ifdef CONFIG_PM_AUTOSLEEP static ssize_t autosleep_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { suspend_state_t state = pm_autosleep_state(); if (state == PM_SUSPEND_ON) return sysfs_emit(buf, "off\n"); #ifdef CONFIG_SUSPEND if (state < PM_SUSPEND_MAX) return sysfs_emit(buf, "%s\n", pm_states[state] ? pm_states[state] : "error"); #endif #ifdef CONFIG_HIBERNATION return sysfs_emit(buf, "disk\n"); #else return sysfs_emit(buf, "error\n"); #endif } static ssize_t autosleep_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { suspend_state_t state = decode_state(buf, n); int error; if (state == PM_SUSPEND_ON && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; if (state == PM_SUSPEND_MEM) state = mem_sleep_current; error = pm_autosleep_set_state(state); return error ? error : n; } power_attr(autosleep); #endif /* CONFIG_PM_AUTOSLEEP */ #ifdef CONFIG_PM_WAKELOCKS static ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, true); } static ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_lock(buf); return error ? error : n; } power_attr(wake_lock); static ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return pm_show_wakelocks(buf, false); } static ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int error = pm_wake_unlock(buf); return error ? error : n; } power_attr(wake_unlock); #endif /* CONFIG_PM_WAKELOCKS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE int pm_trace_enabled; static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", pm_trace_enabled); } static ssize_t pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { int val; if (sscanf(buf, "%d", &val) == 1) { pm_trace_enabled = !!val; if (pm_trace_enabled) { pr_warn("PM: Enabling pm_trace changes system date and time during resume.\n" "PM: Correct system time has to be restored manually after resume.\n"); } return n; } return -EINVAL; } power_attr(pm_trace); static ssize_t pm_trace_dev_match_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return show_trace_dev_match(buf, PAGE_SIZE); } power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ #ifdef CONFIG_FREEZER static ssize_t pm_freeze_timeout_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%u\n", freeze_timeout_msecs); } static ssize_t pm_freeze_timeout_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { unsigned long val; if (kstrtoul(buf, 10, &val)) return -EINVAL; freeze_timeout_msecs = val; return n; } power_attr(pm_freeze_timeout); #endif /* CONFIG_FREEZER*/ static struct attribute * g[] = { &state_attr.attr, #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, &pm_trace_dev_match_attr.attr, #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, #ifdef CONFIG_SUSPEND &mem_sleep_attr.attr, &sync_on_suspend_attr.attr, #endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif #ifdef CONFIG_PM_WAKELOCKS &wake_lock_attr.attr, &wake_unlock_attr.attr, #endif #ifdef CONFIG_PM_SLEEP_DEBUG &pm_test_attr.attr, &pm_print_times_attr.attr, &pm_wakeup_irq_attr.attr, &pm_debug_messages_attr.attr, #endif #endif #ifdef CONFIG_FREEZER &pm_freeze_timeout_attr.attr, #endif NULL, }; static const struct attribute_group attr_group = { .attrs = g, }; static const struct attribute_group *attr_groups[] = { &attr_group, #ifdef CONFIG_PM_SLEEP &suspend_attr_group, #endif NULL, }; struct workqueue_struct *pm_wq; EXPORT_SYMBOL_GPL(pm_wq); static int __init pm_start_workqueue(void) { pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); return pm_wq ? 0 : -ENOMEM; } static int __init pm_init(void) { int error = pm_start_workqueue(); if (error) return error; hibernate_image_size_init(); hibernate_reserved_size_init(); pm_states_init(); power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; error = sysfs_create_groups(power_kobj, attr_groups); if (error) return error; pm_print_times_init(); return pm_autosleep_init(); } core_initcall(pm_init);
6 2 3 2 2 5 2 4 3 4 3 3 4 1 3 1 4 4 6 6 145 145 145 145 1 1 1 1 4 1 3 8 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 // SPDX-License-Identifier: GPL-2.0-only /* * inode.c - part of tracefs, a pseudo file system for activating tracing * * Based on debugfs by: Greg Kroah-Hartman <greg@kroah.com> * * Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com> * * tracefs is the file system that is used by the tracing infrastructure. */ #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/kobject.h> #include <linux/namei.h> #include <linux/tracefs.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/seq_file.h> #include <linux/magic.h> #include <linux/slab.h> #include "internal.h" #define TRACEFS_DEFAULT_MODE 0700 static struct kmem_cache *tracefs_inode_cachep __ro_after_init; static struct vfsmount *tracefs_mount; static int tracefs_mount_count; static bool tracefs_registered; /* * Keep track of all tracefs_inodes in order to update their * flags if necessary on a remount. */ static DEFINE_SPINLOCK(tracefs_inode_lock); static LIST_HEAD(tracefs_inodes); static struct inode *tracefs_alloc_inode(struct super_block *sb) { struct tracefs_inode *ti; unsigned long flags; ti = alloc_inode_sb(sb, tracefs_inode_cachep, GFP_KERNEL); if (!ti) return NULL; spin_lock_irqsave(&tracefs_inode_lock, flags); list_add_rcu(&ti->list, &tracefs_inodes); spin_unlock_irqrestore(&tracefs_inode_lock, flags); return &ti->vfs_inode; } static void tracefs_free_inode(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); kmem_cache_free(tracefs_inode_cachep, ti); } static void tracefs_destroy_inode(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); unsigned long flags; spin_lock_irqsave(&tracefs_inode_lock, flags); list_del_rcu(&ti->list); spin_unlock_irqrestore(&tracefs_inode_lock, flags); } static ssize_t default_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) { return 0; } static ssize_t default_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { return count; } static const struct file_operations tracefs_file_operations = { .read = default_read_file, .write = default_write_file, .open = simple_open, .llseek = noop_llseek, }; static struct tracefs_dir_ops { int (*mkdir)(const char *name); int (*rmdir)(const char *name); } tracefs_ops __ro_after_init; static char *get_dname(struct dentry *dentry) { const char *dname; char *name; int len = dentry->d_name.len; dname = dentry->d_name.name; name = kmalloc(len + 1, GFP_KERNEL); if (!name) return NULL; memcpy(name, dname, len); name[len] = 0; return name; } static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, struct inode *inode, struct dentry *dentry, umode_t mode) { struct tracefs_inode *ti; char *name; int ret; name = get_dname(dentry); if (!name) return -ENOMEM; /* * This is a new directory that does not take the default of * the rootfs. It becomes the default permissions for all the * files and directories underneath it. */ ti = get_tracefs(inode); ti->flags |= TRACEFS_INSTANCE_INODE; ti->private = inode; /* * The mkdir call can call the generic functions that create * the files within the tracefs system. It is up to the individual * mkdir routine to handle races. */ inode_unlock(inode); ret = tracefs_ops.mkdir(name); inode_lock(inode); kfree(name); return ret; } static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) { char *name; int ret; name = get_dname(dentry); if (!name) return -ENOMEM; /* * The rmdir call can call the generic functions that create * the files within the tracefs system. It is up to the individual * rmdir routine to handle races. * This time we need to unlock not only the parent (inode) but * also the directory that is being deleted. */ inode_unlock(inode); inode_unlock(d_inode(dentry)); ret = tracefs_ops.rmdir(name); inode_lock_nested(inode, I_MUTEX_PARENT); inode_lock(d_inode(dentry)); kfree(name); return ret; } static void set_tracefs_inode_owner(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); struct inode *root_inode = ti->private; kuid_t uid; kgid_t gid; uid = root_inode->i_uid; gid = root_inode->i_gid; /* * If the root is not the mount point, then check the root's * permissions. If it was never set, then default to the * mount point. */ if (root_inode != d_inode(root_inode->i_sb->s_root)) { struct tracefs_inode *rti; rti = get_tracefs(root_inode); root_inode = d_inode(root_inode->i_sb->s_root); if (!(rti->flags & TRACEFS_UID_PERM_SET)) uid = root_inode->i_uid; if (!(rti->flags & TRACEFS_GID_PERM_SET)) gid = root_inode->i_gid; } /* * If this inode has never been referenced, then update * the permissions to the superblock. */ if (!(ti->flags & TRACEFS_UID_PERM_SET)) inode->i_uid = uid; if (!(ti->flags & TRACEFS_GID_PERM_SET)) inode->i_gid = gid; } static int tracefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { set_tracefs_inode_owner(inode); return generic_permission(idmap, inode, mask); } static int tracefs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_backing_inode(path->dentry); set_tracefs_inode_owner(inode); generic_fillattr(idmap, request_mask, inode, stat); return 0; } static int tracefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; struct inode *inode = d_inode(dentry); struct tracefs_inode *ti = get_tracefs(inode); if (ia_valid & ATTR_UID) ti->flags |= TRACEFS_UID_PERM_SET; if (ia_valid & ATTR_GID) ti->flags |= TRACEFS_GID_PERM_SET; return simple_setattr(idmap, dentry, attr); } static const struct inode_operations tracefs_instance_dir_inode_operations = { .lookup = simple_lookup, .mkdir = tracefs_syscall_mkdir, .rmdir = tracefs_syscall_rmdir, .permission = tracefs_permission, .getattr = tracefs_getattr, .setattr = tracefs_setattr, }; static const struct inode_operations tracefs_dir_inode_operations = { .lookup = simple_lookup, .permission = tracefs_permission, .getattr = tracefs_getattr, .setattr = tracefs_setattr, }; static const struct inode_operations tracefs_file_inode_operations = { .permission = tracefs_permission, .getattr = tracefs_getattr, .setattr = tracefs_setattr, }; struct inode *tracefs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); } return inode; } struct tracefs_fs_info { kuid_t uid; kgid_t gid; umode_t mode; /* Opt_* bitfield. */ unsigned int opts; }; enum { Opt_uid, Opt_gid, Opt_mode, }; static const struct fs_parameter_spec tracefs_param_specs[] = { fsparam_gid ("gid", Opt_gid), fsparam_u32oct ("mode", Opt_mode), fsparam_uid ("uid", Opt_uid), {} }; static int tracefs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct tracefs_fs_info *opts = fc->s_fs_info; struct fs_parse_result result; int opt; opt = fs_parse(fc, tracefs_param_specs, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_uid: opts->uid = result.uid; break; case Opt_gid: opts->gid = result.gid; break; case Opt_mode: opts->mode = result.uint_32 & S_IALLUGO; break; /* * We might like to report bad mount options here; * but traditionally tracefs has ignored all mount options */ } opts->opts |= BIT(opt); return 0; } static int tracefs_apply_options(struct super_block *sb, bool remount) { struct tracefs_fs_info *fsi = sb->s_fs_info; struct inode *inode = d_inode(sb->s_root); struct tracefs_inode *ti; bool update_uid, update_gid; umode_t tmp_mode; /* * On remount, only reset mode/uid/gid if they were provided as mount * options. */ if (!remount || fsi->opts & BIT(Opt_mode)) { tmp_mode = READ_ONCE(inode->i_mode) & ~S_IALLUGO; tmp_mode |= fsi->mode; WRITE_ONCE(inode->i_mode, tmp_mode); } if (!remount || fsi->opts & BIT(Opt_uid)) inode->i_uid = fsi->uid; if (!remount || fsi->opts & BIT(Opt_gid)) inode->i_gid = fsi->gid; if (remount && (fsi->opts & BIT(Opt_uid) || fsi->opts & BIT(Opt_gid))) { update_uid = fsi->opts & BIT(Opt_uid); update_gid = fsi->opts & BIT(Opt_gid); rcu_read_lock(); list_for_each_entry_rcu(ti, &tracefs_inodes, list) { if (update_uid) { ti->flags &= ~TRACEFS_UID_PERM_SET; ti->vfs_inode.i_uid = fsi->uid; } if (update_gid) { ti->flags &= ~TRACEFS_GID_PERM_SET; ti->vfs_inode.i_gid = fsi->gid; } /* * Note, the above ti->vfs_inode updates are * used in eventfs_remount() so they must come * before calling it. */ if (ti->flags & TRACEFS_EVENT_INODE) eventfs_remount(ti, update_uid, update_gid); } rcu_read_unlock(); } return 0; } static int tracefs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct tracefs_fs_info *sb_opts = sb->s_fs_info; struct tracefs_fs_info *new_opts = fc->s_fs_info; if (!new_opts) return 0; sync_filesystem(sb); /* structure copy of new mount options to sb */ *sb_opts = *new_opts; return tracefs_apply_options(sb, true); } static int tracefs_show_options(struct seq_file *m, struct dentry *root) { struct tracefs_fs_info *fsi = root->d_sb->s_fs_info; if (!uid_eq(fsi->uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, fsi->uid)); if (!gid_eq(fsi->gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, fsi->gid)); if (fsi->mode != TRACEFS_DEFAULT_MODE) seq_printf(m, ",mode=%o", fsi->mode); return 0; } static int tracefs_drop_inode(struct inode *inode) { struct tracefs_inode *ti = get_tracefs(inode); /* * This inode is being freed and cannot be used for * eventfs. Clear the flag so that it doesn't call into * eventfs during the remount flag updates. The eventfs_inode * gets freed after an RCU cycle, so the content will still * be safe if the iteration is going on now. */ ti->flags &= ~TRACEFS_EVENT_INODE; return 1; } static const struct super_operations tracefs_super_operations = { .alloc_inode = tracefs_alloc_inode, .free_inode = tracefs_free_inode, .destroy_inode = tracefs_destroy_inode, .drop_inode = tracefs_drop_inode, .statfs = simple_statfs, .show_options = tracefs_show_options, }; /* * It would be cleaner if eventfs had its own dentry ops. * * Note that d_revalidate is called potentially under RCU, * so it can't take the eventfs mutex etc. It's fine - if * we open a file just as it's marked dead, things will * still work just fine, and just see the old stale case. */ static void tracefs_d_release(struct dentry *dentry) { if (dentry->d_fsdata) eventfs_d_release(dentry); } static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags) { struct eventfs_inode *ei = dentry->d_fsdata; return !(ei && ei->is_freed); } static const struct dentry_operations tracefs_dentry_operations = { .d_revalidate = tracefs_d_revalidate, .d_release = tracefs_d_release, }; static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr trace_files[] = {{""}}; int err; err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files); if (err) return err; sb->s_op = &tracefs_super_operations; sb->s_d_op = &tracefs_dentry_operations; return 0; } static int tracefs_get_tree(struct fs_context *fc) { int err = get_tree_single(fc, tracefs_fill_super); if (err) return err; return tracefs_reconfigure(fc); } static void tracefs_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations tracefs_context_ops = { .free = tracefs_free_fc, .parse_param = tracefs_parse_param, .get_tree = tracefs_get_tree, .reconfigure = tracefs_reconfigure, }; static int tracefs_init_fs_context(struct fs_context *fc) { struct tracefs_fs_info *fsi; fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL); if (!fsi) return -ENOMEM; fsi->mode = TRACEFS_DEFAULT_MODE; fc->s_fs_info = fsi; fc->ops = &tracefs_context_ops; return 0; } static struct file_system_type trace_fs_type = { .owner = THIS_MODULE, .name = "tracefs", .init_fs_context = tracefs_init_fs_context, .parameters = tracefs_param_specs, .kill_sb = kill_litter_super, }; MODULE_ALIAS_FS("tracefs"); struct dentry *tracefs_start_creating(const char *name, struct dentry *parent) { struct dentry *dentry; int error; pr_debug("tracefs: creating file '%s'\n",name); error = simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count); if (error) return ERR_PTR(error); /* If the parent is not specified, we create it in the root. * We need the root dentry to do this, which is in the super * block. A pointer to that is in the struct vfsmount that we * have around. */ if (!parent) parent = tracefs_mount->mnt_root; inode_lock(d_inode(parent)); if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_inode(dentry)) { dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) { inode_unlock(d_inode(parent)); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } return dentry; } struct dentry *tracefs_failed_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; } struct dentry *tracefs_end_creating(struct dentry *dentry) { inode_unlock(d_inode(dentry->d_parent)); return dentry; } /* Find the inode that this will use for default */ static struct inode *instance_inode(struct dentry *parent, struct inode *inode) { struct tracefs_inode *ti; /* If parent is NULL then use root inode */ if (!parent) return d_inode(inode->i_sb->s_root); /* Find the inode that is flagged as an instance or the root inode */ while (!IS_ROOT(parent)) { ti = get_tracefs(d_inode(parent)); if (ti->flags & TRACEFS_INSTANCE_INODE) break; parent = parent->d_parent; } return d_inode(parent); } /** * tracefs_create_file - create a file in the tracefs filesystem * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * file will be created in the root of the tracefs filesystem. * @data: a pointer to something that the caller will want to get to later * on. The inode.i_private pointer will point to this value on * the open() call. * @fops: a pointer to a struct file_operations that should be used for * this file. * * This is the basic "create a file" function for tracefs. It allows for a * wide range of flexibility in creating a file, or a directory (if you want * to create a directory, the tracefs_create_dir() function is * recommended to be used instead.) * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the tracefs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, * you are responsible here.) If an error occurs, %NULL will be returned. * * If tracefs is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops) { struct tracefs_inode *ti; struct dentry *dentry; struct inode *inode; if (security_locked_down(LOCKDOWN_TRACEFS)) return NULL; if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); dentry = tracefs_start_creating(name, parent); if (IS_ERR(dentry)) return NULL; inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) return tracefs_failed_creating(dentry); ti = get_tracefs(inode); ti->private = instance_inode(parent, inode); inode->i_mode = mode; inode->i_op = &tracefs_file_inode_operations; inode->i_fop = fops ? fops : &tracefs_file_operations; inode->i_private = data; inode->i_uid = d_inode(dentry->d_parent)->i_uid; inode->i_gid = d_inode(dentry->d_parent)->i_gid; d_instantiate(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return tracefs_end_creating(dentry); } static struct dentry *__create_dir(const char *name, struct dentry *parent, const struct inode_operations *ops) { struct tracefs_inode *ti; struct dentry *dentry = tracefs_start_creating(name, parent); struct inode *inode; if (IS_ERR(dentry)) return NULL; inode = tracefs_get_inode(dentry->d_sb); if (unlikely(!inode)) return tracefs_failed_creating(dentry); /* Do not set bits for OTH */ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; inode->i_op = ops; inode->i_fop = &simple_dir_operations; inode->i_uid = d_inode(dentry->d_parent)->i_uid; inode->i_gid = d_inode(dentry->d_parent)->i_gid; ti = get_tracefs(inode); ti->private = instance_inode(parent, inode); /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return tracefs_end_creating(dentry); } /** * tracefs_create_dir - create a directory in the tracefs filesystem * @name: a pointer to a string containing the name of the directory to * create. * @parent: a pointer to the parent dentry for this file. This should be a * directory dentry if set. If this parameter is NULL, then the * directory will be created in the root of the tracefs filesystem. * * This function creates a directory in tracefs with the given name. * * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the tracefs_remove() function when the file is * to be removed. If an error occurs, %NULL will be returned. * * If tracing is not enabled in the kernel, the value -%ENODEV will be * returned. */ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) { if (security_locked_down(LOCKDOWN_TRACEFS)) return NULL; return __create_dir(name, parent, &tracefs_dir_inode_operations); } /** * tracefs_create_instance_dir - create the tracing instances directory * @name: The name of the instances directory to create * @parent: The parent directory that the instances directory will exist * @mkdir: The function to call when a mkdir is performed. * @rmdir: The function to call when a rmdir is performed. * * Only one instances directory is allowed. * * The instances directory is special as it allows for mkdir and rmdir * to be done by userspace. When a mkdir or rmdir is performed, the inode * locks are released and the methods passed in (@mkdir and @rmdir) are * called without locks and with the name of the directory being created * within the instances directory. * * Returns the dentry of the instances directory. */ __init struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, int (*mkdir)(const char *name), int (*rmdir)(const char *name)) { struct dentry *dentry; /* Only allow one instance of the instances directory. */ if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir)) return NULL; dentry = __create_dir(name, parent, &tracefs_instance_dir_inode_operations); if (!dentry) return NULL; tracefs_ops.mkdir = mkdir; tracefs_ops.rmdir = rmdir; return dentry; } static void remove_one(struct dentry *victim) { simple_release_fs(&tracefs_mount, &tracefs_mount_count); } /** * tracefs_remove - recursively removes a directory * @dentry: a pointer to a the dentry of the directory to be removed. * * This function recursively removes a directory tree in tracefs that * was previously created with a call to another tracefs function * (like tracefs_create_file() or variants thereof.) */ void tracefs_remove(struct dentry *dentry) { if (IS_ERR_OR_NULL(dentry)) return; simple_pin_fs(&trace_fs_type, &tracefs_mount, &tracefs_mount_count); simple_recursive_removal(dentry, remove_one); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } /** * tracefs_initialized - Tells whether tracefs has been registered */ bool tracefs_initialized(void) { return tracefs_registered; } static void init_once(void *foo) { struct tracefs_inode *ti = (struct tracefs_inode *) foo; /* inode_init_once() calls memset() on the vfs_inode portion */ inode_init_once(&ti->vfs_inode); /* Zero out the rest */ memset_after(ti, 0, vfs_inode); } static int __init tracefs_init(void) { int retval; tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache", sizeof(struct tracefs_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_ACCOUNT), init_once); if (!tracefs_inode_cachep) return -ENOMEM; retval = sysfs_create_mount_point(kernel_kobj, "tracing"); if (retval) return -EINVAL; retval = register_filesystem(&trace_fs_type); if (!retval) tracefs_registered = true; return retval; } core_initcall(tracefs_init);
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 // SPDX-License-Identifier: GPL-2.0-or-later /* * PCTV 452e DVB driver * * Copyright (c) 2006-2008 Dominik Kuhlen <dkuhlen@gmx.net> * * TT connect S2-3650-CI Common Interface support, MAC readout * Copyright (C) 2008 Michael H. Schimek <mschimek@gmx.at> */ /* dvb usb framework */ #define DVB_USB_LOG_PREFIX "pctv452e" #include "dvb-usb.h" /* Demodulator */ #include "stb0899_drv.h" #include "stb0899_reg.h" #include "stb0899_cfg.h" /* Tuner */ #include "stb6100.h" #include "stb6100_cfg.h" /* FE Power */ #include "isl6423.h" #include "lnbp22.h" #include <media/dvb_ca_en50221.h> #include "ttpci-eeprom.h" #include <linux/etherdevice.h> static int debug; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Turn on/off debugging (default:off)."); DVB_DEFINE_MOD_OPT_ADAPTER_NR(adapter_nr); #define ISOC_INTERFACE_ALTERNATIVE 3 #define SYNC_BYTE_OUT 0xaa #define SYNC_BYTE_IN 0x55 /* guessed: (copied from ttusb-budget) */ #define PCTV_CMD_RESET 0x15 /* command to poll IR receiver */ #define PCTV_CMD_IR 0x1b /* command to send I2C */ #define PCTV_CMD_I2C 0x31 #define I2C_ADDR_STB0899 (0xd0 >> 1) #define I2C_ADDR_STB6100 (0xc0 >> 1) #define I2C_ADDR_LNBP22 (0x10 >> 1) #define I2C_ADDR_24C16 (0xa0 >> 1) #define I2C_ADDR_24C64 (0xa2 >> 1) /* pctv452e sends us this amount of data for each issued usb-command */ #define PCTV_ANSWER_LEN 64 /* Wait up to 1000ms for device */ #define PCTV_TIMEOUT 1000 #define PCTV_LED_GPIO STB0899_GPIO01 #define PCTV_LED_GREEN 0x82 #define PCTV_LED_ORANGE 0x02 #define ci_dbg(format, arg...) \ do { \ if (0) \ printk(KERN_DEBUG DVB_USB_LOG_PREFIX \ ": " format "\n" , ## arg); \ } while (0) enum { TT3650_CMD_CI_TEST = 0x40, TT3650_CMD_CI_RD_CTRL, TT3650_CMD_CI_WR_CTRL, TT3650_CMD_CI_RD_ATTR, TT3650_CMD_CI_WR_ATTR, TT3650_CMD_CI_RESET, TT3650_CMD_CI_SET_VIDEO_PORT }; static struct stb0899_postproc pctv45e_postproc[] = { { PCTV_LED_GPIO, STB0899_GPIOPULLUP }, { 0, 0 } }; static struct isl6423_config pctv452e_isl6423_config = { .current_max = SEC_CURRENT_515m, .curlim = SEC_CURRENT_LIM_ON, .mod_extern = 1, .addr = 0x08, }; /* * stores all private variables for communication with the PCTV452e DVB-S2 */ struct pctv452e_state { struct dvb_ca_en50221 ca; struct mutex ca_mutex; u8 c; /* transaction counter, wraps around... */ u8 initialized; /* set to 1 if 0x15 has been sent */ u16 last_rc_key; }; static int tt3650_ci_msg(struct dvb_usb_device *d, u8 cmd, u8 *data, unsigned int write_len, unsigned int read_len) { struct pctv452e_state *state = d->priv; u8 *buf; u8 id; unsigned int rlen; int ret; if (!data || (write_len > 64 - 4) || (read_len > 64 - 4)) { err("%s: transfer data invalid", __func__); return -EIO; } buf = kmalloc(64, GFP_KERNEL); if (!buf) return -ENOMEM; id = state->c++; buf[0] = SYNC_BYTE_OUT; buf[1] = id; buf[2] = cmd; buf[3] = write_len; memcpy(buf + 4, data, write_len); rlen = (read_len > 0) ? 64 : 0; ret = dvb_usb_generic_rw(d, buf, 4 + write_len, buf, rlen, /* delay_ms */ 0); if (0 != ret) goto failed; ret = -EIO; if (SYNC_BYTE_IN != buf[0] || id != buf[1]) goto failed; memcpy(data, buf + 4, read_len); kfree(buf); return 0; failed: err("CI error %d; %02X %02X %02X -> %*ph.", ret, SYNC_BYTE_OUT, id, cmd, 3, buf); kfree(buf); return ret; } static int tt3650_ci_msg_locked(struct dvb_ca_en50221 *ca, u8 cmd, u8 *data, unsigned int write_len, unsigned int read_len) { struct dvb_usb_device *d = ca->data; struct pctv452e_state *state = d->priv; int ret; mutex_lock(&state->ca_mutex); ret = tt3650_ci_msg(d, cmd, data, write_len, read_len); mutex_unlock(&state->ca_mutex); return ret; } static int tt3650_ci_read_attribute_mem(struct dvb_ca_en50221 *ca, int slot, int address) { u8 buf[3]; int ret; if (0 != slot) return -EINVAL; buf[0] = (address >> 8) & 0x0F; buf[1] = address; ret = tt3650_ci_msg_locked(ca, TT3650_CMD_CI_RD_ATTR, buf, 2, 3); ci_dbg("%s %04x -> %d 0x%02x", __func__, address, ret, buf[2]); if (ret < 0) return ret; return buf[2]; } static int tt3650_ci_write_attribute_mem(struct dvb_ca_en50221 *ca, int slot, int address, u8 value) { u8 buf[3]; ci_dbg("%s %d 0x%04x 0x%02x", __func__, slot, address, value); if (0 != slot) return -EINVAL; buf[0] = (address >> 8) & 0x0F; buf[1] = address; buf[2] = value; return tt3650_ci_msg_locked(ca, TT3650_CMD_CI_WR_ATTR, buf, 3, 3); } static int tt3650_ci_read_cam_control(struct dvb_ca_en50221 *ca, int slot, u8 address) { u8 buf[2]; int ret; if (0 != slot) return -EINVAL; buf[0] = address & 3; ret = tt3650_ci_msg_locked(ca, TT3650_CMD_CI_RD_CTRL, buf, 1, 2); ci_dbg("%s 0x%02x -> %d 0x%02x", __func__, address, ret, buf[1]); if (ret < 0) return ret; return buf[1]; } static int tt3650_ci_write_cam_control(struct dvb_ca_en50221 *ca, int slot, u8 address, u8 value) { u8 buf[2]; ci_dbg("%s %d 0x%02x 0x%02x", __func__, slot, address, value); if (0 != slot) return -EINVAL; buf[0] = address; buf[1] = value; return tt3650_ci_msg_locked(ca, TT3650_CMD_CI_WR_CTRL, buf, 2, 2); } static int tt3650_ci_set_video_port(struct dvb_ca_en50221 *ca, int slot, int enable) { u8 buf[1]; int ret; ci_dbg("%s %d %d", __func__, slot, enable); if (0 != slot) return -EINVAL; enable = !!enable; buf[0] = enable; ret = tt3650_ci_msg_locked(ca, TT3650_CMD_CI_SET_VIDEO_PORT, buf, 1, 1); if (ret < 0) return ret; if (enable != buf[0]) { err("CI not %sabled.", enable ? "en" : "dis"); return -EIO; } return 0; } static int tt3650_ci_slot_shutdown(struct dvb_ca_en50221 *ca, int slot) { return tt3650_ci_set_video_port(ca, slot, /* enable */ 0); } static int tt3650_ci_slot_ts_enable(struct dvb_ca_en50221 *ca, int slot) { return tt3650_ci_set_video_port(ca, slot, /* enable */ 1); } static int tt3650_ci_slot_reset(struct dvb_ca_en50221 *ca, int slot) { struct dvb_usb_device *d = ca->data; struct pctv452e_state *state = d->priv; u8 buf[1]; int ret; ci_dbg("%s %d", __func__, slot); if (0 != slot) return -EINVAL; buf[0] = 0; mutex_lock(&state->ca_mutex); ret = tt3650_ci_msg(d, TT3650_CMD_CI_RESET, buf, 1, 1); if (0 != ret) goto failed; msleep(500); buf[0] = 1; ret = tt3650_ci_msg(d, TT3650_CMD_CI_RESET, buf, 1, 1); if (0 != ret) goto failed; msleep(500); buf[0] = 0; /* FTA */ ret = tt3650_ci_msg(d, TT3650_CMD_CI_SET_VIDEO_PORT, buf, 1, 1); failed: mutex_unlock(&state->ca_mutex); return ret; } static int tt3650_ci_poll_slot_status(struct dvb_ca_en50221 *ca, int slot, int open) { u8 buf[1]; int ret; if (0 != slot) return -EINVAL; ret = tt3650_ci_msg_locked(ca, TT3650_CMD_CI_TEST, buf, 0, 1); if (0 != ret) return ret; if (1 == buf[0]) return DVB_CA_EN50221_POLL_CAM_PRESENT | DVB_CA_EN50221_POLL_CAM_READY; return 0; } static void tt3650_ci_uninit(struct dvb_usb_device *d) { struct pctv452e_state *state; ci_dbg("%s", __func__); if (NULL == d) return; state = d->priv; if (NULL == state) return; if (NULL == state->ca.data) return; /* Error ignored. */ tt3650_ci_set_video_port(&state->ca, /* slot */ 0, /* enable */ 0); dvb_ca_en50221_release(&state->ca); memset(&state->ca, 0, sizeof(state->ca)); } static int tt3650_ci_init(struct dvb_usb_adapter *a) { struct dvb_usb_device *d = a->dev; struct pctv452e_state *state = d->priv; int ret; ci_dbg("%s", __func__); mutex_init(&state->ca_mutex); state->ca.owner = THIS_MODULE; state->ca.read_attribute_mem = tt3650_ci_read_attribute_mem; state->ca.write_attribute_mem = tt3650_ci_write_attribute_mem; state->ca.read_cam_control = tt3650_ci_read_cam_control; state->ca.write_cam_control = tt3650_ci_write_cam_control; state->ca.slot_reset = tt3650_ci_slot_reset; state->ca.slot_shutdown = tt3650_ci_slot_shutdown; state->ca.slot_ts_enable = tt3650_ci_slot_ts_enable; state->ca.poll_slot_status = tt3650_ci_poll_slot_status; state->ca.data = d; ret = dvb_ca_en50221_init(&a->dvb_adap, &state->ca, /* flags */ 0, /* n_slots */ 1); if (0 != ret) { err("Cannot initialize CI: Error %d.", ret); memset(&state->ca, 0, sizeof(state->ca)); return ret; } info("CI initialized."); return 0; } #define CMD_BUFFER_SIZE 0x28 static int pctv452e_i2c_msg(struct dvb_usb_device *d, u8 addr, const u8 *snd_buf, u8 snd_len, u8 *rcv_buf, u8 rcv_len) { struct pctv452e_state *state = d->priv; u8 *buf; u8 id; int ret; buf = kmalloc(64, GFP_KERNEL); if (!buf) return -ENOMEM; id = state->c++; ret = -EINVAL; if (snd_len > 64 - 7 || rcv_len > 64 - 7) goto failed; buf[0] = SYNC_BYTE_OUT; buf[1] = id; buf[2] = PCTV_CMD_I2C; buf[3] = snd_len + 3; buf[4] = addr << 1; buf[5] = snd_len; buf[6] = rcv_len; memcpy(buf + 7, snd_buf, snd_len); ret = dvb_usb_generic_rw(d, buf, 7 + snd_len, buf, /* rcv_len */ 64, /* delay_ms */ 0); if (ret < 0) goto failed; /* TT USB protocol error. */ ret = -EIO; if (SYNC_BYTE_IN != buf[0] || id != buf[1]) goto failed; /* I2C device didn't respond as expected. */ ret = -EREMOTEIO; if (buf[5] < snd_len || buf[6] < rcv_len) goto failed; memcpy(rcv_buf, buf + 7, rcv_len); kfree(buf); return rcv_len; failed: err("I2C error %d; %02X %02X %02X %02X %02X -> %*ph", ret, SYNC_BYTE_OUT, id, addr << 1, snd_len, rcv_len, 7, buf); kfree(buf); return ret; } static int pctv452e_i2c_xfer(struct i2c_adapter *adapter, struct i2c_msg *msg, int num) { struct dvb_usb_device *d = i2c_get_adapdata(adapter); int i; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) return -EAGAIN; for (i = 0; i < num; i++) { u8 addr, snd_len, rcv_len, *snd_buf, *rcv_buf; int ret; if (msg[i].flags & I2C_M_RD) { addr = msg[i].addr; snd_buf = NULL; snd_len = 0; rcv_buf = msg[i].buf; rcv_len = msg[i].len; } else { addr = msg[i].addr; snd_buf = msg[i].buf; snd_len = msg[i].len; rcv_buf = NULL; rcv_len = 0; } ret = pctv452e_i2c_msg(d, addr, snd_buf, snd_len, rcv_buf, rcv_len); if (ret < rcv_len) break; } mutex_unlock(&d->i2c_mutex); return i; } static u32 pctv452e_i2c_func(struct i2c_adapter *adapter) { return I2C_FUNC_I2C; } static int pctv452e_power_ctrl(struct dvb_usb_device *d, int i) { struct pctv452e_state *state = d->priv; u8 *b0, *rx; int ret; info("%s: %d\n", __func__, i); if (!i) return 0; if (state->initialized) return 0; b0 = kmalloc(5 + PCTV_ANSWER_LEN, GFP_KERNEL); if (!b0) return -ENOMEM; rx = b0 + 5; /* hmm where should this should go? */ ret = usb_set_interface(d->udev, 0, ISOC_INTERFACE_ALTERNATIVE); if (ret != 0) info("%s: Warning set interface returned: %d\n", __func__, ret); /* this is a one-time initialization, don't know where to put */ b0[0] = 0xaa; b0[1] = state->c++; b0[2] = PCTV_CMD_RESET; b0[3] = 1; b0[4] = 0; /* reset board */ ret = dvb_usb_generic_rw(d, b0, 5, rx, PCTV_ANSWER_LEN, 0); if (ret) goto ret; b0[1] = state->c++; b0[4] = 1; /* reset board (again?) */ ret = dvb_usb_generic_rw(d, b0, 5, rx, PCTV_ANSWER_LEN, 0); if (ret) goto ret; state->initialized = 1; ret: kfree(b0); return ret; } static int pctv452e_rc_query(struct dvb_usb_device *d) { struct pctv452e_state *state = d->priv; u8 *b, *rx; int ret, i; u8 id; b = kmalloc(CMD_BUFFER_SIZE + PCTV_ANSWER_LEN, GFP_KERNEL); if (!b) return -ENOMEM; rx = b + CMD_BUFFER_SIZE; id = state->c++; /* prepare command header */ b[0] = SYNC_BYTE_OUT; b[1] = id; b[2] = PCTV_CMD_IR; b[3] = 0; /* send ir request */ ret = dvb_usb_generic_rw(d, b, 4, rx, PCTV_ANSWER_LEN, 0); if (ret != 0) goto ret; if (debug > 3) { info("%s: read: %2d: %*ph: ", __func__, ret, 3, rx); for (i = 0; (i < rx[3]) && ((i+3) < PCTV_ANSWER_LEN); i++) info(" %02x", rx[i+3]); info("\n"); } if ((rx[3] == 9) && (rx[12] & 0x01)) { /* got a "press" event */ state->last_rc_key = RC_SCANCODE_RC5(rx[7], rx[6]); if (debug > 2) info("%s: cmd=0x%02x sys=0x%02x\n", __func__, rx[6], rx[7]); rc_keydown(d->rc_dev, RC_PROTO_RC5, state->last_rc_key, 0); } else if (state->last_rc_key) { rc_keyup(d->rc_dev); state->last_rc_key = 0; } ret: kfree(b); return ret; } static int pctv452e_read_mac_address(struct dvb_usb_device *d, u8 mac[6]) { const u8 mem_addr[] = { 0x1f, 0xcc }; u8 encoded_mac[20]; int ret; ret = -EAGAIN; if (mutex_lock_interruptible(&d->i2c_mutex) < 0) goto failed; ret = pctv452e_i2c_msg(d, I2C_ADDR_24C16, mem_addr + 1, /* snd_len */ 1, encoded_mac, /* rcv_len */ 20); if (-EREMOTEIO == ret) /* Caution! A 24C16 interprets 0xA2 0x1F 0xCC as a byte write if /WC is low. */ ret = pctv452e_i2c_msg(d, I2C_ADDR_24C64, mem_addr, 2, encoded_mac, 20); mutex_unlock(&d->i2c_mutex); if (20 != ret) goto failed; ret = ttpci_eeprom_decode_mac(mac, encoded_mac); if (0 != ret) goto failed; return 0; failed: eth_zero_addr(mac); return ret; } static const struct stb0899_s1_reg pctv452e_init_dev[] = { { STB0899_DISCNTRL1, 0x26 }, { STB0899_DISCNTRL2, 0x80 }, { STB0899_DISRX_ST0, 0x04 }, { STB0899_DISRX_ST1, 0x20 }, { STB0899_DISPARITY, 0x00 }, { STB0899_DISFIFO, 0x00 }, { STB0899_DISF22, 0x99 }, { STB0899_DISF22RX, 0x85 }, /* 0xa8 */ { STB0899_ACRPRESC, 0x11 }, { STB0899_ACRDIV1, 0x0a }, { STB0899_ACRDIV2, 0x05 }, { STB0899_DACR1 , 0x00 }, { STB0899_DACR2 , 0x00 }, { STB0899_OUTCFG, 0x00 }, { STB0899_MODECFG, 0x00 }, /* Inversion */ { STB0899_IRQMSK_3, 0xf3 }, { STB0899_IRQMSK_2, 0xfc }, { STB0899_IRQMSK_1, 0xff }, { STB0899_IRQMSK_0, 0xff }, { STB0899_I2CCFG, 0x88 }, { STB0899_I2CRPT, 0x58 }, { STB0899_GPIO00CFG, 0x82 }, { STB0899_GPIO01CFG, 0x82 }, /* LED: 0x02 green, 0x82 orange */ { STB0899_GPIO02CFG, 0x82 }, { STB0899_GPIO03CFG, 0x82 }, { STB0899_GPIO04CFG, 0x82 }, { STB0899_GPIO05CFG, 0x82 }, { STB0899_GPIO06CFG, 0x82 }, { STB0899_GPIO07CFG, 0x82 }, { STB0899_GPIO08CFG, 0x82 }, { STB0899_GPIO09CFG, 0x82 }, { STB0899_GPIO10CFG, 0x82 }, { STB0899_GPIO11CFG, 0x82 }, { STB0899_GPIO12CFG, 0x82 }, { STB0899_GPIO13CFG, 0x82 }, { STB0899_GPIO14CFG, 0x82 }, { STB0899_GPIO15CFG, 0x82 }, { STB0899_GPIO16CFG, 0x82 }, { STB0899_GPIO17CFG, 0x82 }, { STB0899_GPIO18CFG, 0x82 }, { STB0899_GPIO19CFG, 0x82 }, { STB0899_GPIO20CFG, 0x82 }, { STB0899_SDATCFG, 0xb8 }, { STB0899_SCLTCFG, 0xba }, { STB0899_AGCRFCFG, 0x1c }, /* 0x11 DVB-S; 0x1c DVB-S2 (1c, rjkm) */ { STB0899_GPIO22, 0x82 }, { STB0899_GPIO21, 0x91 }, { STB0899_DIRCLKCFG, 0x82 }, { STB0899_CLKOUT27CFG, 0x7e }, { STB0899_STDBYCFG, 0x82 }, { STB0899_CS0CFG, 0x82 }, { STB0899_CS1CFG, 0x82 }, { STB0899_DISEQCOCFG, 0x20 }, { STB0899_NCOARSE, 0x15 }, /* 0x15 27Mhz, F/3 198MHz, F/6 108MHz */ { STB0899_SYNTCTRL, 0x00 }, /* 0x00 CLKI, 0x02 XTALI */ { STB0899_FILTCTRL, 0x00 }, { STB0899_SYSCTRL, 0x00 }, { STB0899_STOPCLK1, 0x20 }, /* orig: 0x00 budget-ci: 0x20 */ { STB0899_STOPCLK2, 0x00 }, { STB0899_INTBUFCTRL, 0x0a }, { STB0899_AGC2I1, 0x00 }, { STB0899_AGC2I2, 0x00 }, { STB0899_AGCIQIN, 0x00 }, { STB0899_TSTRES, 0x40 }, /* rjkm */ { 0xffff, 0xff }, }; static const struct stb0899_s1_reg pctv452e_init_s1_demod[] = { { STB0899_DEMOD, 0x00 }, { STB0899_RCOMPC, 0xc9 }, { STB0899_AGC1CN, 0x01 }, { STB0899_AGC1REF, 0x10 }, { STB0899_RTC, 0x23 }, { STB0899_TMGCFG, 0x4e }, { STB0899_AGC2REF, 0x34 }, { STB0899_TLSR, 0x84 }, { STB0899_CFD, 0xf7 }, { STB0899_ACLC, 0x87 }, { STB0899_BCLC, 0x94 }, { STB0899_EQON, 0x41 }, { STB0899_LDT, 0xf1 }, { STB0899_LDT2, 0xe3 }, { STB0899_EQUALREF, 0xb4 }, { STB0899_TMGRAMP, 0x10 }, { STB0899_TMGTHD, 0x30 }, { STB0899_IDCCOMP, 0xfd }, { STB0899_QDCCOMP, 0xff }, { STB0899_POWERI, 0x0c }, { STB0899_POWERQ, 0x0f }, { STB0899_RCOMP, 0x6c }, { STB0899_AGCIQIN, 0x80 }, { STB0899_AGC2I1, 0x06 }, { STB0899_AGC2I2, 0x00 }, { STB0899_TLIR, 0x30 }, { STB0899_RTF, 0x7f }, { STB0899_DSTATUS, 0x00 }, { STB0899_LDI, 0xbc }, { STB0899_CFRM, 0xea }, { STB0899_CFRL, 0x31 }, { STB0899_NIRM, 0x2b }, { STB0899_NIRL, 0x80 }, { STB0899_ISYMB, 0x1d }, { STB0899_QSYMB, 0xa6 }, { STB0899_SFRH, 0x2f }, { STB0899_SFRM, 0x68 }, { STB0899_SFRL, 0x40 }, { STB0899_SFRUPH, 0x2f }, { STB0899_SFRUPM, 0x68 }, { STB0899_SFRUPL, 0x40 }, { STB0899_EQUAI1, 0x02 }, { STB0899_EQUAQ1, 0xff }, { STB0899_EQUAI2, 0x04 }, { STB0899_EQUAQ2, 0x05 }, { STB0899_EQUAI3, 0x02 }, { STB0899_EQUAQ3, 0xfd }, { STB0899_EQUAI4, 0x03 }, { STB0899_EQUAQ4, 0x07 }, { STB0899_EQUAI5, 0x08 }, { STB0899_EQUAQ5, 0xf5 }, { STB0899_DSTATUS2, 0x00 }, { STB0899_VSTATUS, 0x00 }, { STB0899_VERROR, 0x86 }, { STB0899_IQSWAP, 0x2a }, { STB0899_ECNT1M, 0x00 }, { STB0899_ECNT1L, 0x00 }, { STB0899_ECNT2M, 0x00 }, { STB0899_ECNT2L, 0x00 }, { STB0899_ECNT3M, 0x0a }, { STB0899_ECNT3L, 0xad }, { STB0899_FECAUTO1, 0x06 }, { STB0899_FECM, 0x01 }, { STB0899_VTH12, 0xb0 }, { STB0899_VTH23, 0x7a }, { STB0899_VTH34, 0x58 }, { STB0899_VTH56, 0x38 }, { STB0899_VTH67, 0x34 }, { STB0899_VTH78, 0x24 }, { STB0899_PRVIT, 0xff }, { STB0899_VITSYNC, 0x19 }, { STB0899_RSULC, 0xb1 }, /* DVB = 0xb1, DSS = 0xa1 */ { STB0899_TSULC, 0x42 }, { STB0899_RSLLC, 0x41 }, { STB0899_TSLPL, 0x12 }, { STB0899_TSCFGH, 0x0c }, { STB0899_TSCFGM, 0x00 }, { STB0899_TSCFGL, 0x00 }, { STB0899_TSOUT, 0x69 }, /* 0x0d for CAM */ { STB0899_RSSYNCDEL, 0x00 }, { STB0899_TSINHDELH, 0x02 }, { STB0899_TSINHDELM, 0x00 }, { STB0899_TSINHDELL, 0x00 }, { STB0899_TSLLSTKM, 0x1b }, { STB0899_TSLLSTKL, 0xb3 }, { STB0899_TSULSTKM, 0x00 }, { STB0899_TSULSTKL, 0x00 }, { STB0899_PCKLENUL, 0xbc }, { STB0899_PCKLENLL, 0xcc }, { STB0899_RSPCKLEN, 0xbd }, { STB0899_TSSTATUS, 0x90 }, { STB0899_ERRCTRL1, 0xb6 }, { STB0899_ERRCTRL2, 0x95 }, { STB0899_ERRCTRL3, 0x8d }, { STB0899_DMONMSK1, 0x27 }, { STB0899_DMONMSK0, 0x03 }, { STB0899_DEMAPVIT, 0x5c }, { STB0899_PLPARM, 0x19 }, { STB0899_PDELCTRL, 0x48 }, { STB0899_PDELCTRL2, 0x00 }, { STB0899_BBHCTRL1, 0x00 }, { STB0899_BBHCTRL2, 0x00 }, { STB0899_HYSTTHRESH, 0x77 }, { STB0899_MATCSTM, 0x00 }, { STB0899_MATCSTL, 0x00 }, { STB0899_UPLCSTM, 0x00 }, { STB0899_UPLCSTL, 0x00 }, { STB0899_DFLCSTM, 0x00 }, { STB0899_DFLCSTL, 0x00 }, { STB0899_SYNCCST, 0x00 }, { STB0899_SYNCDCSTM, 0x00 }, { STB0899_SYNCDCSTL, 0x00 }, { STB0899_ISI_ENTRY, 0x00 }, { STB0899_ISI_BIT_EN, 0x00 }, { STB0899_MATSTRM, 0xf0 }, { STB0899_MATSTRL, 0x02 }, { STB0899_UPLSTRM, 0x45 }, { STB0899_UPLSTRL, 0x60 }, { STB0899_DFLSTRM, 0xe3 }, { STB0899_DFLSTRL, 0x00 }, { STB0899_SYNCSTR, 0x47 }, { STB0899_SYNCDSTRM, 0x05 }, { STB0899_SYNCDSTRL, 0x18 }, { STB0899_CFGPDELSTATUS1, 0x19 }, { STB0899_CFGPDELSTATUS2, 0x2b }, { STB0899_BBFERRORM, 0x00 }, { STB0899_BBFERRORL, 0x01 }, { STB0899_UPKTERRORM, 0x00 }, { STB0899_UPKTERRORL, 0x00 }, { 0xffff, 0xff }, }; static struct stb0899_config stb0899_config = { .init_dev = pctv452e_init_dev, .init_s2_demod = stb0899_s2_init_2, .init_s1_demod = pctv452e_init_s1_demod, .init_s2_fec = stb0899_s2_init_4, .init_tst = stb0899_s1_init_5, .demod_address = I2C_ADDR_STB0899, /* I2C Address */ .block_sync_mode = STB0899_SYNC_FORCED, /* ? */ .xtal_freq = 27000000, /* Assume Hz ? */ .inversion = IQ_SWAP_ON, .lo_clk = 76500000, .hi_clk = 99000000, .ts_output_mode = 0, /* Use parallel mode */ .clock_polarity = 0, .data_clk_parity = 0, .fec_mode = 0, .esno_ave = STB0899_DVBS2_ESNO_AVE, .esno_quant = STB0899_DVBS2_ESNO_QUANT, .avframes_coarse = STB0899_DVBS2_AVFRAMES_COARSE, .avframes_fine = STB0899_DVBS2_AVFRAMES_FINE, .miss_threshold = STB0899_DVBS2_MISS_THRESHOLD, .uwp_threshold_acq = STB0899_DVBS2_UWP_THRESHOLD_ACQ, .uwp_threshold_track = STB0899_DVBS2_UWP_THRESHOLD_TRACK, .uwp_threshold_sof = STB0899_DVBS2_UWP_THRESHOLD_SOF, .sof_search_timeout = STB0899_DVBS2_SOF_SEARCH_TIMEOUT, .btr_nco_bits = STB0899_DVBS2_BTR_NCO_BITS, .btr_gain_shift_offset = STB0899_DVBS2_BTR_GAIN_SHIFT_OFFSET, .crl_nco_bits = STB0899_DVBS2_CRL_NCO_BITS, .ldpc_max_iter = STB0899_DVBS2_LDPC_MAX_ITER, .tuner_get_frequency = stb6100_get_frequency, .tuner_set_frequency = stb6100_set_frequency, .tuner_set_bandwidth = stb6100_set_bandwidth, .tuner_get_bandwidth = stb6100_get_bandwidth, .tuner_set_rfsiggain = NULL, /* helper for switching LED green/orange */ .postproc = pctv45e_postproc }; static struct stb6100_config stb6100_config = { .tuner_address = I2C_ADDR_STB6100, .refclock = 27000000 }; static struct i2c_algorithm pctv452e_i2c_algo = { .master_xfer = pctv452e_i2c_xfer, .functionality = pctv452e_i2c_func }; static int pctv452e_frontend_attach(struct dvb_usb_adapter *a) { struct usb_device_id *id; a->fe_adap[0].fe = dvb_attach(stb0899_attach, &stb0899_config, &a->dev->i2c_adap); if (!a->fe_adap[0].fe) return -ENODEV; id = a->dev->desc->warm_ids[0]; if (id->idVendor == USB_VID_TECHNOTREND && id->idProduct == USB_PID_TECHNOTREND_CONNECT_S2_3650_CI) { if (dvb_attach(lnbp22_attach, a->fe_adap[0].fe, &a->dev->i2c_adap) == NULL) { err("Cannot attach lnbp22\n"); } /* Error ignored. */ tt3650_ci_init(a); } else if (dvb_attach(isl6423_attach, a->fe_adap[0].fe, &a->dev->i2c_adap, &pctv452e_isl6423_config) == NULL) { err("Cannot attach isl6423\n"); } return 0; } static int pctv452e_tuner_attach(struct dvb_usb_adapter *a) { if (!a->fe_adap[0].fe) return -ENODEV; if (dvb_attach(stb6100_attach, a->fe_adap[0].fe, &stb6100_config, &a->dev->i2c_adap) == NULL) { err("%s failed\n", __func__); return -ENODEV; } return 0; } enum { PINNACLE_PCTV_452E, TECHNOTREND_CONNECT_S2_3600, TECHNOTREND_CONNECT_S2_3650_CI, }; static struct usb_device_id pctv452e_usb_table[] = { DVB_USB_DEV(PINNACLE, PINNACLE_PCTV_452E), DVB_USB_DEV(TECHNOTREND, TECHNOTREND_CONNECT_S2_3600), DVB_USB_DEV(TECHNOTREND, TECHNOTREND_CONNECT_S2_3650_CI), { } }; MODULE_DEVICE_TABLE(usb, pctv452e_usb_table); static struct dvb_usb_device_properties pctv452e_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, /* more ? */ .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct pctv452e_state), .power_ctrl = pctv452e_power_ctrl, .rc.core = { .rc_codes = RC_MAP_DIB0700_RC5_TABLE, .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = pctv452e_rc_query, .rc_interval = 100, }, .num_adapters = 1, .adapter = {{ .num_frontends = 1, .fe = {{ .frontend_attach = pctv452e_frontend_attach, .tuner_attach = pctv452e_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_ISOC, .count = 4, .endpoint = 0x02, .u = { .isoc = { .framesperurb = 4, .framesize = 940, .interval = 1 } } }, } }, } }, .i2c_algo = &pctv452e_i2c_algo, .generic_bulk_ctrl_endpoint = 1, /* allow generice rw function */ .num_device_descs = 1, .devices = { { .name = "PCTV HDTV USB", .cold_ids = { NULL, NULL }, /* this is a warm only device */ .warm_ids = { &pctv452e_usb_table[PINNACLE_PCTV_452E], NULL } }, { NULL }, } }; static struct dvb_usb_device_properties tt_connect_s2_3600_properties = { .caps = DVB_USB_IS_AN_I2C_ADAPTER, /* more ? */ .usb_ctrl = DEVICE_SPECIFIC, .size_of_priv = sizeof(struct pctv452e_state), .power_ctrl = pctv452e_power_ctrl, .read_mac_address = pctv452e_read_mac_address, .rc.core = { .rc_codes = RC_MAP_TT_1500, .allowed_protos = RC_PROTO_BIT_RC5, .rc_query = pctv452e_rc_query, .rc_interval = 100, }, .num_adapters = 1, .adapter = {{ .num_frontends = 1, .fe = {{ .frontend_attach = pctv452e_frontend_attach, .tuner_attach = pctv452e_tuner_attach, /* parameter for the MPEG2-data transfer */ .stream = { .type = USB_ISOC, .count = 4, .endpoint = 0x02, .u = { .isoc = { .framesperurb = 64, .framesize = 940, .interval = 1 } } }, } }, } }, .i2c_algo = &pctv452e_i2c_algo, .generic_bulk_ctrl_endpoint = 1, /* allow generic rw function*/ .num_device_descs = 2, .devices = { { .name = "Technotrend TT Connect S2-3600", .cold_ids = { NULL, NULL }, /* this is a warm only device */ .warm_ids = { &pctv452e_usb_table[TECHNOTREND_CONNECT_S2_3600], NULL } }, { .name = "Technotrend TT Connect S2-3650-CI", .cold_ids = { NULL, NULL }, .warm_ids = { &pctv452e_usb_table[TECHNOTREND_CONNECT_S2_3650_CI], NULL } }, { NULL }, } }; static void pctv452e_usb_disconnect(struct usb_interface *intf) { struct dvb_usb_device *d = usb_get_intfdata(intf); tt3650_ci_uninit(d); dvb_usb_device_exit(intf); } static int pctv452e_usb_probe(struct usb_interface *intf, const struct usb_device_id *id) { if (0 == dvb_usb_device_init(intf, &pctv452e_properties, THIS_MODULE, NULL, adapter_nr) || 0 == dvb_usb_device_init(intf, &tt_connect_s2_3600_properties, THIS_MODULE, NULL, adapter_nr)) return 0; return -ENODEV; } static struct usb_driver pctv452e_usb_driver = { .name = "pctv452e", .probe = pctv452e_usb_probe, .disconnect = pctv452e_usb_disconnect, .id_table = pctv452e_usb_table, }; module_usb_driver(pctv452e_usb_driver); MODULE_AUTHOR("Dominik Kuhlen <dkuhlen@gmx.net>"); MODULE_AUTHOR("Andre Weidemann <Andre.Weidemann@web.de>"); MODULE_AUTHOR("Michael H. Schimek <mschimek@gmx.at>"); MODULE_DESCRIPTION("Pinnacle PCTV HDTV USB DVB / TT connect S2-3600 Driver"); MODULE_LICENSE("GPL");
1 1 1 1 276 249 31 1 31 37 2 29 44 2 42 44 60 11 48 59 51 9 5 23 2 28 17 11 28 14 3 48 47 1 48 44 13 10 6 5 6 3 46 46 2 1 3 4 6 10 12 1 12 12 12 12 13 2 18 20 7 13 20 20 20 10 1 12 29 29 20 1 1 1 4 1 28 28 4 4 30 21 29 29 63 14 20 4 29 5 16 8 61 1 29 28 29 29 5 3 5 20 3 17 17 5 8 2 2 4 1 4 1 1 6 11 14 128 42 51 34 25 36 3 2 3 2 3 25 56 17 27 5 23 1 2 9 41 30 12 21 2 13 2 1 2 3 9 1 1 1 5 3 2 24 1 15 8 4 4 5 2 5 39 35 4 6 22 12 1 5 28 8 1 11 2 2 21 23 2 1 2 2 27 6 1 19 1 8 1 3 3 58 24 22 14 34 34 34 34 311 309 207 208 91 218 31 7 310 8 1 1 3 3 1 4 1 1 3 1 2 3 2 1 1 1 579 549 34 1 6 5 4 1097 659 363 97 58 46 537 34 105 364 99 36 430 430 430 429 430 428 429 136 2 78 27 14 15 9 15 52 3 5 7 32 15 49 29 20 25 5 2 3 4 10 10 1 49 38 19 28 48 20 28 47 4 48 17 1 10 20 48 48 28 20 48 8 51 51 1 2 102 3 143 3 2 1 1 5 7 5 7 3 4 5 9 9 11 2 35 1 2 249 55 25 23 322 10 56 276 191 153 324 326 2 323 14 42 57 206 7 2 54 259 314 311 4 311 307 167 34 113 241 24 12 244 123 346 15 344 28 1 28 28 26 2 28 26 2 26 2 28 20 8 28 14 7 7 1 2 5 7 6 7 12 4 4 5 12 7 5 12 2 10 1 5 3 2 14 2 1 12 13 1 12 8 3 11 40 1 4 33 26 6 7 28 2 4 26 30 9 4 17 20 3 10 1 10 1 10 3 29 4 2 2 1 4 2 1 1 3 1 1 15 2 1 3 8 2 6 21 5 1 20 5 1 1 1 20 17 9 8 3 2 2 2 1 24 4 4 190 1 89 1 29 1 21 9 1 1 2 1 2 1 1 1 2 1 1 1 2 1 1 1 2 2 1 1 1 1 1 2 2 7 3 4 2 4 1 1 1 41 1 38 2 24 1 1 1 1 1 1 2 1 2 45 1 1 2 2 1 1 2 2 2 2 3 1 1 2 3 3 2 2 2 3 1 2 2 6 28 1589 1588 243 84 8 121 2 4 5 1 44 1 2 41 39 1 1 29 26 5 29 26 4 5 5 5 5 8 1 7 1 2 30 30 46 3 45 2 26 17 1 3 38 1 2 2 1 32 7 3 1 7 13 7 7 26 29 8 22 29 7 26 29 2 8 5 1 4 4 3 2 1 2 2 2 2 2 2 5 5 5 5 5 2 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PACKET - implements raw packet sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Fixes: * Alan Cox : verify_area() now used correctly * Alan Cox : new skbuff lists, look ma no backlogs! * Alan Cox : tidied skbuff lists. * Alan Cox : Now uses generic datagram routines I * added. Also fixed the peek/read crash * from all old Linux datagram code. * Alan Cox : Uses the improved datagram code. * Alan Cox : Added NULL's for socket options. * Alan Cox : Re-commented the code. * Alan Cox : Use new kernel side addressing * Rob Janssen : Correct MTU usage. * Dave Platt : Counter leaks caused by incorrect * interrupt locking and some slightly * dubious gcc output. Can you read * compiler: it said _VOLATILE_ * Richard Kooijman : Timestamp fixes. * Alan Cox : New buffers. Use sk->mac.raw. * Alan Cox : sendmsg/recvmsg support. * Alan Cox : Protocol setting support * Alexey Kuznetsov : Untied from IPv4 stack. * Cyrus Durgin : Fixed kerneld for kmod. * Michal Ostrowski : Module initialization cleanup. * Ulises Alonso : Frame number limit removal and * packet_set_ring memory leak. * Eric Biederman : Allow for > 8 byte hardware addresses. * The convention is that longer addresses * will simply extend the hardware address * byte arrays at the end of sockaddr_ll * and packet_mreq. * Johann Baudy : Added TX RING. * Chetan Loke : Implemented TPACKET_V3 block abstraction * layer. * Copyright (C) 2011, <lokec@ccs.neu.edu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/wireless.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/if_vlan.h> #include <linux/virtio_net.h> #include <linux/errqueue.h> #include <linux/net_tstamp.h> #include <linux/percpu.h> #ifdef CONFIG_INET #include <net/inet_common.h> #endif #include <linux/bpf.h> #include <net/compat.h> #include <linux/netfilter_netdev.h> #include "internal.h" /* Assumptions: - If the device has no dev->header_ops->create, there is no LL header visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. For example, a WiFi driver pretending to be an Ethernet driver should set its hard_header_len to be the Ethernet header length, and set its needed_headroom to be (the real WiFi header length - the fake Ethernet header length). - packet socket receives packets with pulled ll header, so that SOCK_RAW should push it back. On receive: ----------- Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. On transmit: ------------ dev_has_header(dev) == true mac_header -> ll header data -> ll header dev_has_header(dev) == false (ll header is invisible to us) mac_header -> data data -> data We should set network_header on output to the correct position, packet classifier depends on it. */ /* Private packet socket structures. */ /* identical to struct packet_mreq except it has * a longer address field. */ struct packet_mreq_max { int mr_ifindex; unsigned short mr_type; unsigned short mr_alen; unsigned char mr_address[MAX_ADDR_LEN]; }; union tpacket_uhdr { struct tpacket_hdr *h1; struct tpacket2_hdr *h2; struct tpacket3_hdr *h3; void *raw; }; static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring); #define V3_ALIGNMENT (8) #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) #define BLK_PLUS_PRIV(sz_of_priv) \ (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) struct packet_sock; static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status); static void packet_increment_head(struct packet_ring_buffer *buff); static int prb_curr_blk_in_use(struct tpacket_block_desc *); static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, struct packet_sock *); static void prb_retire_current_block(struct tpacket_kbdq_core *, struct packet_sock *, unsigned int status); static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); static void prb_retire_rx_blk_timer_expired(struct timer_list *); static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); static u16 packet_pick_tx_queue(struct sk_buff *skb); struct packet_skb_cb { union { struct sockaddr_pkt pkt; union { /* Trick: alias skb original length with * ll.sll_family and ll.protocol in order * to save room. */ unsigned int origlen; struct sockaddr_ll ll; }; } sa; }; #define vio_le() virtio_legacy_is_little_endian() #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) #define GET_PBLOCK_DESC(x, bid) \ ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) #define GET_NEXT_PRB_BLK_NUM(x) \ (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ ((x)->kactive_blk_num+1) : 0) static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); #ifdef CONFIG_NETFILTER_EGRESS static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) { struct sk_buff *next, *head = NULL, *tail; int rc; rcu_read_lock(); for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); if (!nf_hook_egress(skb, &rc, skb->dev)) continue; if (!head) head = skb; else tail->next = skb; tail = skb; } rcu_read_unlock(); return head; } #endif static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb) { if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) return dev_queue_xmit(skb); #ifdef CONFIG_NETFILTER_EGRESS if (nf_hook_egress_active()) { skb = nf_hook_direct_egress(skb); if (!skb) return NET_XMIT_DROP; } #endif return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) { struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(po->cached_dev); dev_hold(dev); rcu_read_unlock(); return dev; } static void packet_cached_dev_assign(struct packet_sock *po, struct net_device *dev) { rcu_assign_pointer(po->cached_dev, dev); } static void packet_cached_dev_reset(struct packet_sock *po) { RCU_INIT_POINTER(po->cached_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; int cpu = raw_smp_processor_id(); u16 queue_index; #ifdef CONFIG_XPS skb->sender_cpu = cpu + 1; #endif skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues); if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb, NULL); queue_index = netdev_cap_txqueue(dev, queue_index); } else { queue_index = netdev_pick_tx(dev, skb, NULL); } return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). */ static void __register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) { if (po->fanout) __fanout_link(sk, po); else dev_add_pack(&po->prot_hook); sock_hold(sk); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1); } } static void register_prot_hook(struct sock *sk) { lockdep_assert_held_once(&pkt_sk(sk)->bind_lock); __register_prot_hook(sk); } /* If the sync parameter is true, we will temporarily drop * the po->bind_lock and do a synchronize_net to make sure no * asynchronous packet processing paths still refer to the elements * of po->prot_hook. If the sync parameter is false, it is the * callers responsibility to take care of this. */ static void __unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); lockdep_assert_held_once(&po->bind_lock); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0); if (po->fanout) __fanout_unlink(sk, po); else __dev_remove_pack(&po->prot_hook); __sock_put(sk); if (sync) { spin_unlock(&po->bind_lock); synchronize_net(); spin_lock(&po->bind_lock); } } static void unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) __unregister_prot_hook(sk, sync); } static inline struct page * __pure pgv_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); return virt_to_page(addr); } static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } smp_wmb(); } static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; smp_rmb(); /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); return 0; } } static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps && (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; if ((flags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb_tstamp(skb), ts)) return TP_STATUS_TS_SOFTWARE; return 0; } static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; struct timespec64 ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp)))) return 0; h.raw = frame; /* * versions 1 through 3 overflow the timestamps in y2106, since they * all store the seconds in a 32-bit unsigned integer. * If we create a version 4, that should have a 64-bit timestamp, * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit * nanoseconds. */ switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; break; case TPACKET_V2: h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } /* one flush is safe, as both fields always lie on the same cacheline */ flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); smp_wmb(); return ts_status; } static void *packet_lookup_frame(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int position, int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; h.raw = rb->pg_vec[pg_vec_pos].buffer + (frame_offset * rb->frame_size); if (status != __packet_get_status(po, h.raw)) return NULL; return h.raw; } static void *packet_current_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { return packet_lookup_frame(po, rb, rb->head, status); } static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { struct vlan_hdr vhdr, *vh; unsigned int header_len; if (!dev) return 0; /* In the SOCK_DGRAM scenario, skb data starts at the network * protocol, which is after the VLAN headers. The outer VLAN * header is at the hard_header_len offset in non-variable * length link layer headers. If it's a VLAN device, the * min_header_len should be used to exclude the VLAN header * size. */ if (dev->min_header_len == dev->hard_header_len) header_len = dev->hard_header_len; else if (is_vlan_dev(dev)) header_len = dev->min_header_len; else return 0; vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; return ntohs(vh->h_vlan_TCI); } static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; if (unlikely(eth_type_vlan(proto))) proto = __vlan_get_protocol_offset(skb, proto, skb_mac_offset(skb), NULL); return proto; } static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) { del_timer_sync(&pkc->retire_blk_timer); } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); spin_lock_bh(&rb_queue->lock); pkc->delete_blk_timer = 1; spin_unlock_bh(&rb_queue->lock); prb_del_retire_blk_timer(pkc); } static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, 0); pkc->retire_blk_timer.expires = jiffies; } static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); if (unlikely(!dev)) { rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (err) return DEFAULT_PRB_RETIRE_TOV; /* If the link speed is so slow you don't really * need to worry about perf anyways */ if (ecmd.base.speed < SPEED_1000 || ecmd.base.speed == SPEED_UNKNOWN) return DEFAULT_PRB_RETIRE_TOV; div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; if (div) return mbits + 1; return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, union tpacket_req_u *req_u) { p1->feature_req_word = req_u->req3.tp_feature_req_word; } static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; memset(p1, 0x0, sizeof(*p1)); p1->knxt_seq_num = 1; p1->pkbdq = pg_vec; pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; p1->pkblk_start = pg_vec[0].buffer; p1->kblk_size = req_u->req3.tp_block_size; p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; else p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, req_u->req3.tp_block_size); p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po); prb_open_block(p1, pbd); } /* Do NOT update the last_blk_num first. * Assumes sk_buff_head lock is held. */ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) { mod_timer(&pkc->retire_blk_timer, jiffies + pkc->tov_in_jiffies); pkc->last_kactive_blk_num = pkc->kactive_blk_num; } /* * Timer logic: * 1) We refresh the timer only when we open a block. * By doing this we don't waste cycles refreshing the timer * on packet-by-packet basis. * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * * So, if the user sets the 'tmo' to 10ms then the timer * will never fire while the block is still getting filled * (which is what we want). However, the user could choose * to close a block early and that's fine. * * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. * */ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) { struct packet_sock *po = from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; spin_lock(&po->sk.sk_receive_queue.lock); frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); if (unlikely(pkc->delete_blk_timer)) goto out; /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() * copy_bits() is in progress ... * timer fires on other cpu: * we can't retire the current block because copy_bits * is in progress. * */ if (BLOCK_NUM_PKTS(pbd)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { if (!frozen) { if (!BLOCK_NUM_PKTS(pbd)) { /* An empty block. Just refresh the timer. */ goto refresh_timer; } prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); if (!prb_dispatch_next_block(pkc, po)) goto refresh_timer; else goto out; } else { /* Case 1. Queue was frozen because user-space was * lagging behind. */ if (prb_curr_blk_in_use(pbd)) { /* * Ok, user-space is still behind. * So just refresh the timer. */ goto refresh_timer; } else { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this * block and restart the timer. * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect. */ prb_open_block(pkc, pbd); goto out; } } } refresh_timer: _prb_refresh_rx_retire_blk_timer(pkc); out: spin_unlock(&po->sk.sk_receive_queue.lock); } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, __u32 status) { /* Flush everything minus the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 u8 *start, *end; start = (u8 *)pbd1; /* Skip the block header(we know header WILL fit in 4K) */ start += PAGE_SIZE; end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); for (; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif /* Now update the block status. */ BLOCK_STATUS(pbd1) = status; /* Flush the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 start = (u8 *)pbd1; flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif } /* * Side effect: * * 1) flush the block * 2) Increment active_blk_num * * Note:We DONT refresh the timer on purpose. * Because almost always the next block will be opened. */ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, struct packet_sock *po, unsigned int stat) { __u32 status = TP_STATUS_USER | stat; struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; last_pkt->tp_next_offset = 0; /* Get the ts of the last pkt */ if (BLOCK_NUM_PKTS(pbd1)) { h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { /* Ok, we tmo'd - so get the current time. * * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ struct timespec64 ts; ktime_get_real_ts64(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } smp_wmb(); /* Flush the block */ prb_flush_block(pkc1, pbd1, status); sk->sk_data_ready(sk); pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); } static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) { pkc->reset_pending_on_curr_blk = 0; } /* * Side effect of opening a block: * * 1) prb_queue is thawed. * 2) retire_blk_timer is refreshed. * */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); /* We could have just memset this but we will lose the * flexibility of making the priv area sticky */ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); ktime_get_real_ts64(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; pkc1->pkblk_start = (char *)pbd1; pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; pbd1->version = pkc1->version; pkc1->prev = pkc1->nxt_offset; pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } /* * Queue freeze logic: * 1) Assume tp_block_nr = 8 blocks. * 2) At time 't0', user opens Rx ring. * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 * 4) user-space is either sleeping or processing block '0'. * 5) tpacket_rcv is currently filling block '7', since there is no space left, * it will close block-7,loop around and try to fill block '0'. * call-flow: * __packet_lookup_frame_in_block * prb_retire_current_block() * prb_dispatch_next_block() * |->(BLOCK_STATUS == USER) evaluates to true * 5.1) Since block-0 is currently in-use, we just freeze the queue. * 6) Now there are two cases: * 6.1) Link goes idle right after the queue is frozen. * But remember, the last open_block() refreshed the timer. * When this timer expires,it will refresh itself so that we can * re-open block-0 in near future. * 6.2) Link is busy and keeps on receiving packets. This is a simple * case and __packet_lookup_frame_in_block will check if block-0 * is free and can now be re-used. */ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { pkc->reset_pending_on_curr_blk = 1; po->stats.stats3.tp_freeze_q_cnt++; } #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) /* * If the next block is free then we will dispatch it * and return a good offset. * Else, we will freeze the queue. * So, caller must check the return value. */ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { struct tpacket_block_desc *pbd; smp_rmb(); /* 1. Get current block num */ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* 2. If this block is currently in_use then freeze the queue */ if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { prb_freeze_queue(pkc, po); return NULL; } /* * 3. * open this block and return the offset where the first packet * needs to get stored. */ prb_open_block(pkc, pbd); return (void *)pkc->nxt_offset; } static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po, unsigned int status) { struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* retire/close the current block */ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { /* * Plug the case where copy_bits() is in progress on * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't * have space to copy the pkt in the current block and * called prb_retire_current_block() * * We don't need to worry about the TMO case because * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; } } static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd) { return TP_STATUS_USER & BLOCK_STATUS(pbd); } static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) { return pkc->reset_pending_on_curr_blk; } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); } static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = 0; } static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc); if (skb_vlan_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) { ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; ppd->hv1.tp_vlan_tpid = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_padding = 0; prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) prb_fill_rxhash(pkc, ppd); else prb_clear_rxhash(pkc, ppd); } static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) __acquires(&pkc->blk_fill_in_prog_lock) { struct tpacket3_hdr *ppd; ppd = (struct tpacket3_hdr *)curr; ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); pkc->prev = curr; pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, unsigned int len ) { struct tpacket_kbdq_core *pkc; struct tpacket_block_desc *pbd; char *curr, *end; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* Queue is frozen when user space is lagging behind */ if (prb_queue_frozen(pkc)) { /* * Check if that last block which caused the queue to freeze, * is still in_use by user-space. */ if (prb_curr_blk_in_use(pbd)) { /* Can't record this packet */ return NULL; } else { /* * Ok, the block was released by user-space. * Now let's open that block. * opening a block also thaws the queue. * Thawing is a side effect. */ prb_open_block(pkc, pbd); } } smp_mb(); curr = pkc->nxt_offset; pkc->skb = skb; end = (char *)pbd + pkc->kblk_size; /* first try the current block */ if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* Ok, close the current block */ prb_retire_current_block(pkc, po, 0); /* Now, try to dispatch the next block */ curr = (char *)prb_dispatch_next_block(pkc, po); if (curr) { pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* * No free blocks are available.user_space hasn't caught up yet. * Queue was just frozen and now this packet will get dropped. */ return NULL; } static void *packet_current_rx_frame(struct packet_sock *po, struct sk_buff *skb, int status, unsigned int len) { char *curr = NULL; switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: curr = packet_lookup_frame(po, &po->rx_ring, po->rx_ring.head, status); return curr; case TPACKET_V3: return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); return NULL; } } static void *prb_lookup_block(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int idx, int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); if (status != BLOCK_STATUS(pbd)) return NULL; return pbd; } static int prb_previous_blk_num(struct packet_ring_buffer *rb) { unsigned int prev; if (rb->prb_bdqc.kactive_blk_num) prev = rb->prb_bdqc.kactive_blk_num-1; else prev = rb->prb_bdqc.knum_blocks-1; return prev; } /* Assumes caller has held the rx_queue.lock */ static void *__prb_previous_block(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = prb_previous_blk_num(rb); return prb_lookup_block(po, rb, previous, status); } static void *packet_previous_rx_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { if (po->tp_version <= TPACKET_V2) return packet_previous_frame(po, rb, status); return __prb_previous_block(po, rb, status); } static void packet_increment_rx_head(struct packet_sock *po, struct packet_ring_buffer *rb) { switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: return packet_increment_head(rb); case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); return; } } static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; return packet_lookup_frame(po, rb, previous, status); } static void packet_increment_head(struct packet_ring_buffer *buff) { buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } static void packet_inc_pending(struct packet_ring_buffer *rb) { this_cpu_inc(*rb->pending_refcnt); } static void packet_dec_pending(struct packet_ring_buffer *rb) { this_cpu_dec(*rb->pending_refcnt); } static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) { unsigned int refcnt = 0; int cpu; /* We don't use pending refcount in rx_ring. */ if (rb->pending_refcnt == NULL) return 0; for_each_possible_cpu(cpu) refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); return refcnt; } static int packet_alloc_pending(struct packet_sock *po) { po->rx_ring.pending_refcnt = NULL; po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); if (unlikely(po->tx_ring.pending_refcnt == NULL)) return -ENOBUFS; return 0; } static void packet_free_pending(struct packet_sock *po) { free_percpu(po->tx_ring.pending_refcnt); } #define ROOM_POW_OFF 2 #define ROOM_NONE 0x0 #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.frame_max) + 1; idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static int __packet_rcv_has_room(const struct packet_sock *po, const struct sk_buff *skb) { const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { int rcvbuf = READ_ONCE(sk->sk_rcvbuf); int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) - (skb ? skb->truesize : 0); if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; else return ROOM_NONE; } if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_v3_has_room(po, 0)) ret = ROOM_LOW; } else { if (__tpacket_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_has_room(po, 0)) ret = ROOM_LOW; } return ret; } static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { bool pressure; int ret; ret = __packet_rcv_has_room(po, skb); pressure = ret != ROOM_NORMAL; if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure); return ret; } static void packet_rcv_try_clear_pressure(struct packet_sock *po) { if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false); } static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive packet socket: %p\n", sk); return; } } static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { u32 *history = po->rollover->history; u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) if (READ_ONCE(history[i]) == rxhash) count++; victim = get_random_u32_below(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) WRITE_ONCE(history[victim], rxhash); return count > (ROLLOVER_HLEN >> 1); } static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return reciprocal_scale(__skb_get_hash_symmetric(skb), num); } static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { unsigned int val = atomic_inc_return(&f->rr_cur); return val % num; } static unsigned int fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return smp_processor_id() % num; } static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return get_random_u32_below(num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, bool try_self, unsigned int num) { struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; po = pkt_sk(rcu_dereference(f->arr[idx])); if (try_self) { room = packet_rcv_has_room(po, skb); if (room == ROOM_NORMAL || (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) return idx; po_skip = po; } i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(rcu_dereference(f->arr[i])); if (po_next != po_skip && !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; atomic_long_inc(&po->rollover->num); if (room == ROOM_LOW) atomic_long_inc(&po->rollover->num_huge); return i; } if (++i == num) i = 0; } while (i != j); atomic_long_inc(&po->rollover->num_failed); return idx; } static unsigned int fanout_demux_qm(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return skb_get_queue_mapping(skb) % num; } static unsigned int fanout_demux_bpf(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { struct bpf_prog *prog; unsigned int ret = 0; rcu_read_lock(); prog = rcu_dereference(f->bpf_prog); if (prog) ret = bpf_prog_run_clear_cb(prog, skb) % num; rcu_read_unlock(); return ret; } static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); } static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; if (!net_eq(dev_net(dev), net) || !num) { kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } switch (f->type) { case PACKET_FANOUT_HASH: default: idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: idx = fanout_demux_lb(f, skb, num); break; case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; case PACKET_FANOUT_RND: idx = fanout_demux_rnd(f, skb, num); break; case PACKET_FANOUT_QM: idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) idx = fanout_demux_rollover(f, skb, idx, true, num); po = pkt_sk(rcu_dereference(f->arr[idx])); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); static u16 fanout_next_id; static void __fanout_link(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; spin_lock(&f->lock); rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; if (f->num_members == 1) dev_add_pack(&f->prot_hook); spin_unlock(&f->lock); } static void __fanout_unlink(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; int i; spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { if (rcu_dereference_protected(f->arr[i], lockdep_is_held(&f->lock)) == sk) break; } BUG_ON(i >= f->num_members); rcu_assign_pointer(f->arr[i], rcu_dereference_protected(f->arr[f->num_members - 1], lockdep_is_held(&f->lock))); f->num_members--; if (f->num_members == 0) __dev_remove_pack(&f->prot_hook); spin_unlock(&f->lock); } static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { if (sk->sk_family != PF_PACKET) return false; return ptype->af_packet_priv == pkt_sk(sk)->fanout; } static void fanout_init_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_LB: atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } } static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) { struct bpf_prog *old; spin_lock(&f->lock); old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); rcu_assign_pointer(f->bpf_prog, new); spin_unlock(&f->lock); if (old) { synchronize_net(); bpf_prog_destroy(old); } } static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; struct sock_fprog fprog; int ret; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; ret = copy_bpf_fprog_from_user(&fprog, data, len); if (ret) return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) return ret; __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; u32 fd; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; if (len != sizeof(fd)) return -EINVAL; if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); case PACKET_FANOUT_EBPF: return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; } } static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); } } static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) { struct packet_fanout *f; list_for_each_entry(f, &fanout_list, list) { if (f->id == candidate_id && read_pnet(&f->net) == sock_net(sk)) { return false; } } return true; } static bool fanout_find_new_id(struct sock *sk, u16 *new_id) { u16 id = fanout_next_id; do { if (__fanout_id_is_free(sk, id)) { *new_id = id; fanout_next_id = id + 1; return true; } id++; } while (id != fanout_next_id); return false; } static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; u16 id = args->id; int err; switch (type) { case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; break; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: break; default: return -EINVAL; } mutex_lock(&fanout_mutex); err = -EALREADY; if (po->fanout) goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { err = -ENOMEM; rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); if (!rollover) goto out; atomic_long_set(&rollover->num, 0); atomic_long_set(&rollover->num_huge, 0); atomic_long_set(&rollover->num_failed, 0); } if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { if (id != 0) { err = -EINVAL; goto out; } if (!fanout_find_new_id(sk, &id)) { err = -ENOMEM; goto out; } /* ephemeral flag for the first socket in the group: drop it */ flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8); } match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && read_pnet(&f->net) == sock_net(sk)) { match = f; break; } } err = -EINVAL; if (match) { if (match->flags != flags) goto out; if (args->max_num_members && args->max_num_members != match->max_num_members) goto out; } else { if (args->max_num_members > PACKET_FANOUT_MAX) goto out; if (!args->max_num_members) /* legacy PACKET_FANOUT_MAX */ args->max_num_members = 256; err = -ENOMEM; match = kvzalloc(struct_size(match, arr, args->max_num_members), GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); match->id = id; match->type = type; match->flags = flags; INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); refcount_set(&match->sk_ref, 0); fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; spin_lock(&po->bind_lock); if (po->num && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match); po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __dev_remove_pack(&po->prot_hook); __fanout_link(sk, po); } err = 0; } } spin_unlock(&po->bind_lock); if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); kvfree(match); } out: kfree(rollover); mutex_unlock(&fanout_mutex); return err; } /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout. * It is the responsibility of the caller to call fanout_release_data() and * free the returned packet_fanout (after synchronize_net()) */ static struct packet_fanout *fanout_release(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; mutex_lock(&fanout_mutex); f = po->fanout; if (f) { po->fanout = NULL; if (refcount_dec_and_test(&f->sk_ref)) list_del(&f->list); else f = NULL; } mutex_unlock(&fanout_mutex); return f; } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, struct sk_buff *skb) { /* Earlier code assumed this would be a VLAN pkt, double-check * this now that we have the actual packet in hand. We can only * do this check on Ethernet devices. */ if (unlikely(dev->type != ARPHRD_ETHER)) return false; skb_reset_mac_header(skb); return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)); } static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; /* * When we registered the protocol we saved the socket in the data * field for just this event. */ sk = pt->af_packet_priv; /* * Yank back the headers [hope the device set this * right or kerboom...] * * Incoming packets have ll header pulled, * push it back. * * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop. */ if (skb->pkt_type == PACKET_LOOPBACK) goto out; if (!net_eq(dev_net(dev), sock_net(sk))) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) goto oom; /* drop any routing info */ skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; skb_push(skb, skb->data - skb_mac_header(skb)); /* * The SOCK_PACKET socket receives _all_ frames. */ spkt->spkt_family = dev->type; strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); spkt->spkt_protocol = skb->protocol; /* * Charge the memory to the socket. This is done specifically * to prevent sockets using all the memory up. */ if (sock_queue_rcv_skb(sk, skb) == 0) return 0; out: kfree_skb(skb); oom: return 0; } static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) { int depth; if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && sock->type == SOCK_RAW) { skb_reset_mac_header(skb); skb->protocol = dev_parse_header_protocol(skb); } /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } /* * Output a raw packet to a device layer. This bypasses all the other * protocol layers and you must therefore supply it with a complete frame */ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; struct sockcm_cookie sockc; __be16 proto = 0; int err; int extra_len = 0; /* * Get and verify the address. */ if (saddr) { if (msg->msg_namelen < sizeof(struct sockaddr)) return -EINVAL; if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) proto = saddr->spkt_protocol; } else return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ /* * Find the device first to size check it */ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; retry: rcu_read_lock(); dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; err = -ENETDOWN; if (!(dev->flags & IFF_UP)) goto out_unlock; /* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level. */ if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) goto out_unlock; if (!skb) { size_t reserved = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; rcu_read_unlock(); skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard * header at transmission time by themselves. PPP is the notable * one here. This should really be fixed at the driver level. */ skb_reserve(skb, reserved); skb_reset_network_header(skb); /* Try to align data part correctly */ if (hhlen) { skb->data -= hhlen; skb->tail -= hhlen; if (len < hhlen) skb_reset_network_header(skb); } err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) goto out_free; goto retry; } if (!dev_validate_header(dev, skb->data, len) || !skb->len) { err = -EINVAL; goto out_unlock; } if (len > (dev->mtu + dev->hard_header_len + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_unlock; } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = READ_ONCE(sk->sk_mark); skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); dev_queue_xmit(skb); rcu_read_unlock(); return len; out_unlock: rcu_read_unlock(); out_free: kfree_skb(skb); return err; } static unsigned int run_filter(struct sk_buff *skb, const struct sock *sk, unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; } static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, size_t *len, int vnet_hdr_sz) { struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); } /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. * * Note tricky part: we DO mangle shared skb! skb->data, skb->len * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh * sequentially, so that if we return skb to original state on exit, * we will not harm anyone. */ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; skb->dev = dev; if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * * Otherwise, the device hides details of its frame * structure, so that corresponding packet head is * never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; if (snaplen > res) snaplen = res; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto drop_n_acct; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb == NULL) goto drop_n_acct; if (skb_head != skb->data) { skb->data = skb_head; skb->len = skb_len; } consume_skb(skb); skb = nskb; } sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_hatype = dev->type; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; sll->sll_halen = dev_parse_header(skb, sll->sll_addr); /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). * Use their space for storing the original skb length. */ PACKET_SKB_CB(skb)->sa.origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; skb_set_owner_r(skb, sk); skb->dev = NULL; skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; sock_skb_set_dropcount(sk, skb); skb_clear_delivery_time(skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); return 0; drop_n_acct: atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; } static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct packet_sock *po; struct sockaddr_ll *sll; union tpacket_uhdr h; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; unsigned long status = TP_STATUS_USER; unsigned short macoff, hdrlen; unsigned int netoff; struct sk_buff *copy_skb = NULL; struct timespec64 ts; __u32 ts_status; unsigned int slot_id = 0; int vnet_hdr_sz = 0; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing * userspace to call getsockopt(..., PACKET_HDRLEN, ...). */ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; /* If we are flooded, just give up */ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; if (sk->sk_type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + po->tp_reserve; } else { unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); if (vnet_hdr_sz) netoff += vnet_hdr_sz; macoff = netoff - maclen; } if (netoff > USHRT_MAX) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { if (READ_ONCE(po->copy_thresh) && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); } else { copy_skb = skb_get(skb); skb_head = skb->data; } if (copy_skb) { memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { snaplen = 0; vnet_hdr_sz = 0; } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { u32 nval; nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", snaplen, nval, macoff); snaplen = nval; if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; vnet_hdr_sz = 0; } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto drop_n_account; if (po->tp_version <= TPACKET_V2) { slot_id = po->rx_ring.head; if (test_bit(slot_id, po->rx_ring.rx_owner_map)) goto drop_n_account; __set_bit(slot_id, po->rx_ring.rx_owner_map); } if (vnet_hdr_sz && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true, 0)) { if (po->tp_version == TPACKET_V3) prb_clear_blk_fill_status(&po->rx_ring); goto drop_n_account; } if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* * LOSING will be reported till you read the stats, * because it's COR - Clear On Read. * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; skb_clear_delivery_time(copy_skb); __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); /* Always timestamp; prefer an existing software timestamp taken * closer to the time of capture. */ ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp) | SOF_TIMESTAMPING_SOFTWARE); if (!ts_status) ktime_get_real_ts64(&ts); status |= ts_status; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: h.h2->tp_len = skb->len; h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; if (skb_vlan_tag_present(skb)) { h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev); h.h2->tp_vlan_tpid = ntohs(skb->protocol); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; h.h2->tp_vlan_tpid = 0; } memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); hdrlen = sizeof(*h.h2); break; case TPACKET_V3: /* tp_nxt_offset,vlan are already populated above. * So DONT clear those fields here */ h.h3->tp_status |= status; h.h3->tp_len = skb->len; h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; default: BUG(); } sll = h.raw + TPACKET_ALIGN(hdrlen); sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; smp_mb(); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 if (po->tp_version <= TPACKET_V2) { u8 *start, *end; end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + macoff + snaplen); for (start = h.raw; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); } smp_wmb(); #endif if (po->tp_version <= TPACKET_V2) { spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); __clear_bit(slot_id, po->rx_ring.rx_owner_map); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else if (po->tp_version == TPACKET_V3) { prb_clear_blk_fill_status(&po->rx_ring); } drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; drop_n_account: spin_unlock(&sk->sk_receive_queue.lock); atomic_inc(&po->tp_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; sk->sk_data_ready(sk); sk_skb_reason_drop(sk, copy_skb, drop_reason); goto drop_n_restore; } static void tpacket_destruct_skb(struct sk_buff *skb) { struct packet_sock *po = pkt_sk(skb->sk); if (likely(po->tx_ring.pg_vec)) { void *ph; __u32 ts; ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); complete(&po->skb_completion); } sock_wfree(skb); } static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) { if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) return -EINVAL; return 0; } static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) { int ret; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; ret = __packet_snd_vnet_parse(vnet_hdr, *len); if (ret) return ret; /* move iter to point to the start of mac header */ if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); return 0; } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len, __be16 proto, unsigned char *addr, int hlen, int copylen, const struct sockcm_cookie *sockc) { union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; int err; ph.raw = frame; skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(po->sk.sk_priority); skb->mark = READ_ONCE(po->sk.sk_mark); skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); to_write = tp_len; if (sock->type == SOCK_DGRAM) { err = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; } else if (copylen) { int hdrlen = min_t(int, copylen, tp_len); skb_push(skb, dev->hard_header_len); skb_put(skb, copylen - dev->hard_header_len); err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; if (!dev_validate_header(dev, skb->data, hdrlen)) return -EINVAL; data += hdrlen; to_write -= hdrlen; } offset = offset_in_page(data); len_max = PAGE_SIZE - offset; len = ((to_write > len_max) ? len_max : to_write); skb->data_len = to_write; skb->len += to_write; skb->truesize += to_write; refcount_add(to_write, &po->sk.sk_wmem_alloc); while (likely(to_write)) { nr_frags = skb_shinfo(skb)->nr_frags; if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { pr_err("Packet exceed the number of skb frags(%u)\n", (unsigned int)MAX_SKB_FRAGS); return -EFAULT; } page = pgv_to_page(data); data += len; flush_dcache_page(page); get_page(page); skb_fill_page_desc(skb, nr_frags, page, offset, len); to_write -= len; offset = 0; len_max = PAGE_SIZE; len = ((to_write > len_max) ? len_max : to_write); } packet_parse_headers(skb, sock); return tp_len; } static int tpacket_parse_header(struct packet_sock *po, void *frame, int size_max, void **data) { union tpacket_uhdr ph; int tp_len, off; ph.raw = frame; switch (po->tp_version) { case TPACKET_V3: if (ph.h3->tp_next_offset != 0) { pr_warn_once("variable sized slot not supported"); return -EINVAL; } tp_len = ph.h3->tp_len; break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; default: tp_len = ph.h1->tp_len; break; } if (unlikely(tp_len > size_max)) { pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); return -EMSGSIZE; } if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) { int off_min, off_max; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_net; break; case TPACKET_V2: off = ph.h2->tp_net; break; default: off = ph.h1->tp_net; break; } } else { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_mac; break; case TPACKET_V2: off = ph.h2->tp_mac; break; default: off = ph.h1->tp_mac; break; } } if (unlikely((off < off_min) || (off_max < off))) return -EINVAL; } else { off = po->tp_hdrlen - sizeof(struct sockaddr_ll); } *data = frame + off; return tp_len; } static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb = NULL; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsigned char *addr = NULL; int tp_len, size_max; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; long timeo = 0; mutex_lock(&po->pg_vec_lock); /* packet_sendmsg() check on tx_ring.pg_vec was lockless, * we need to confirm it under protection of pg_vec_lock. */ if (unlikely(!po->tx_ring.pg_vec)) { err = -EBUSY; goto out; } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (po->sk.sk_socket->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_put; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_put; sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) goto out_put; } if (po->sk.sk_socket->type == SOCK_RAW) reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; reinit_completion(&po->skb_completion); do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { if (need_wait && skb) { timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) { err = !timeo ? -ETIMEDOUT : -ERESTARTSYS; goto out_put; } } /* check for additional frames */ continue; } skb = NULL; tp_len = tpacket_parse_header(po, ph, size_max, &data); if (tp_len < 0) goto tpacket_error; status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; if (vnet_hdr_sz) { vnet_hdr = data; data += vnet_hdr_sz; tp_len -= vnet_hdr_sz; if (tp_len < 0 || __packet_snd_vnet_parse(vnet_hdr, tp_len)) { tp_len = -EINVAL; goto tpacket_error; } copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len); } copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { /* we assume the socket was initially writeable ... */ if (likely(len_sum > 0)) err = len_sum; goto out_status; } tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && !vnet_hdr_sz && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { tpacket_error: if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); packet_increment_head(&po->tx_ring); kfree_skb(skb); continue; } else { status = TP_STATUS_WRONG_FORMAT; err = tp_len; goto out_status; } } if (vnet_hdr_sz) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; } virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ skb = NULL; goto out_status; } /* * skb was dropped but not destructed yet; * let's treat it like congestion or err < 0 */ err = 0; } packet_increment_head(&po->tx_ring); len_sum += tp_len; } while (likely((ph != NULL) || /* Note: packet_read_pending() might be slow if we have * to call it as it's per_cpu variable, but in fast-path * we already short-circuit the loop with the first * condition, and luckily don't have to go that path * anyway. */ (need_wait && packet_read_pending(&po->tx_ring)))); err = len_sum; goto out_put; out_status: __packet_set_status(po, ph, status); kfree_skb(skb); out_put: dev_put(dev); out: mutex_unlock(&po->pg_vec_lock); return err; } static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, size_t reserve, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, reserve); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); struct sk_buff *skb; struct net_device *dev; __be16 proto; unsigned char *addr = NULL; int err, reserve = 0; struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); int hlen, tlen, linear; int extra_len = 0; /* * Get and verify the address. */ if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); if (sock->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_unlock; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out_unlock; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; sockcm_init(&sockc, sk); sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (vnet_hdr_sz) { err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); if (err) goto out_unlock; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len); linear = max(linear, min_t(int, len, dev->hard_header_len)); skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; skb_reset_network_header(skb); err = -EINVAL; if (sock->type == SOCK_DGRAM) { offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); if (len < reserve + sizeof(struct ipv6hdr) && dev->min_header_len != dev->hard_header_len) skb_reset_network_header(skb); } /* Returns -EFAULT on error */ err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len); if (err) goto out_free; if ((sock->type == SOCK_RAW && !dev_validate_header(dev, skb->data, len)) || !skb->len) { err = -EINVAL; goto out_free; } skb_setup_tx_timestamp(skb, &sockc); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; } skb->protocol = proto; skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); if (vnet_hdr_sz) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; len += vnet_hdr_sz; virtio_net_hdr_set_proto(skb, &vnet_hdr); } err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err) goto out_unlock; } dev_put(dev); return len; out_free: kfree_skb(skb); out_unlock: dev_put(dev); out: return err; } static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. * tpacket_snd() will redo the check safely. */ if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg); return packet_snd(sock, msg, len); } /* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. */ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; struct packet_fanout *f; struct net *net; union tpacket_req_u req_u; if (!sk) return 0; net = sock_net(sk); po = pkt_sk(sk); mutex_lock(&net->packet.sklist_lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(&po->bind_lock); unregister_prot_hook(sk, false); packet_cached_dev_reset(po); if (po->prot_hook.dev) { netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); packet_flush_mclist(sk); lock_sock(sk); if (po->rx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 0); } if (po->tx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 1); } release_sock(sk); f = fanout_release(sk); synchronize_net(); kfree(po->rollover); if (f) { fanout_release_data(f); kvfree(f); } /* * Now the socket is dead. No more input will appear. */ sock_orphan(sk); sock->sk = NULL; /* Purge queues */ skb_queue_purge(&sk->sk_receive_queue); packet_free_pending(po); sock_put(sk); return 0; } /* * Attach a packet hook. */ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, __be16 proto) { struct packet_sock *po = pkt_sk(sk); struct net_device *dev = NULL; bool unlisted = false; bool need_rehook; int ret = 0; lock_sock(sk); spin_lock(&po->bind_lock); if (!proto) proto = po->num; rcu_read_lock(); if (po->fanout) { ret = -EINVAL; goto out_unlock; } if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { ret = -ENODEV; goto out_unlock; } } else if (ifindex) { dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { ret = -ENODEV; goto out_unlock; } } need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev; if (need_rehook) { dev_hold(dev); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { rcu_read_unlock(); /* prevents packet_notifier() from calling * register_prot_hook() */ WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); if (dev) unlisted = !dev_get_by_index_rcu(sock_net(sk), dev->ifindex); } BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING)); WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); if (unlikely(unlisted)) { po->prot_hook.dev = NULL; WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { netdev_hold(dev, &po->prot_hook.dev_tracker, GFP_ATOMIC); po->prot_hook.dev = dev; WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0); packet_cached_dev_assign(po, dev); } dev_put(dev); } if (proto == 0 || !need_rehook) goto out_unlock; if (!unlisted && (!dev || (dev->flags & IFF_UP))) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } out_unlock: rcu_read_unlock(); spin_unlock(&po->bind_lock); release_sock(sk); return ret; } /* * Bind a packet socket to a device */ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; char name[sizeof(uaddr->sa_data_min) + 1]; /* * Check legality */ if (addr_len != sizeof(struct sockaddr)) return -EINVAL; /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); name[sizeof(uaddr->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; /* * Check legality */ if (addr_len < sizeof(struct sockaddr_ll)) return -EINVAL; if (sll->sll_family != AF_PACKET) return -EINVAL; return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { .name = "PACKET", .owner = THIS_MODULE, .obj_size = sizeof(struct packet_sock), }; /* * Create a packet of type SOCK_PACKET. */ static int packet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct packet_sock *po; __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; err = -ENOBUFS; sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; po = pkt_sk(sk); err = packet_alloc_pending(po); if (err) goto out_sk_free; sock_init_data(sock, sk); init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; /* * Attach a protocol block */ spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; __register_prot_hook(sk); } mutex_lock(&net->packet.sklist_lock); sk_add_node_tail_rcu(sk, &net->packet.sklist); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, &packet_proto, 1); return 0; out_sk_free: sk_free(sk); out: return err; } /* * Pull a packet from our receive queue and hand it to the user. * If necessary we block. */ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); unsigned int origlen = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) goto out; #if 0 /* What error should we return now? EUNATTACH? */ if (pkt_sk(sk)->ifindex < 0) return -ENODEV; #endif if (flags & MSG_ERRQUEUE) { err = sock_recv_errqueue(sk, msg, len, SOL_PACKET, PACKET_TX_TIMESTAMP); goto out; } /* * Call the generic datagram receiver. This handles all sorts * of horrible races and re-entrancy so we can forget about it * in the protocol layers. * * Now it will return ENETDOWN, if device have just gone down, * but then it will block. */ skb = skb_recv_datagram(sk, flags, &err); /* * An error occurred so return it. Because skb_recv_datagram() * handles the blocking we don't see and worry about blocking * retries. */ if (skb == NULL) goto out; packet_rcv_try_clear_pressure(pkt_sk(sk)); if (vnet_hdr_len) { err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free; } /* You lose any data beyond the buffer you gave. If it worries * a user program they can ask the device for its MTU * anyway. */ copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; if (sock->type != SOCK_PACKET) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; /* Original length was stored in sockaddr_ll fields */ origlen = PACKET_SKB_CB(skb)->sa.origlen; sll->sll_family = AF_PACKET; sll->sll_protocol = (sock->type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { const size_t max_len = min(sizeof(skb->cb), sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); copy_len = msg->msg_namelen; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { memset(msg->msg_name + offsetof(struct sockaddr_ll, sll_addr), 0, sizeof(sll->sll_addr)); msg->msg_namelen = sizeof(struct sockaddr_ll); } } if (WARN_ON_ONCE(copy_len > max_len)) { copy_len = max_len; msg->msg_namelen = copy_len; } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); if (skb_vlan_tag_present(skb)) { aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex); if (dev) { aux.tp_vlan_tci = vlan_get_tci(skb, dev); aux.tp_vlan_tpid = ntohs(skb->protocol); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } rcu_read_unlock(); } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } /* * Free or return the buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); out_free: skb_free_datagram(sk, skb); out: return err; } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; if (peer) return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock(); return sizeof(*uaddr); } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); int ifindex; if (peer) return -EOPNOTSUPP; ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; /* Let __fortify_memcpy_chk() know the actual buffer size. */ memcpy(((struct sockaddr_storage *)sll)->__data + offsetof(struct sockaddr_ll, sll_addr) - offsetofend(struct sockaddr_ll, sll_family), dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; } rcu_read_unlock(); return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) { switch (i->type) { case PACKET_MR_MULTICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_mc_add(dev, i->addr); else return dev_mc_del(dev, i->addr); break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_uc_add(dev, i->addr); else return dev_uc_del(dev, i->addr); break; default: break; } return 0; } static void packet_dev_mclist_delete(struct net_device *dev, struct packet_mclist **mlp) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { packet_dev_mc(dev, ml, -1); *mlp = ml->next; kfree(ml); } else mlp = &ml->next; } } static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml, *i; struct net_device *dev; int err; rtnl_lock(); err = -ENODEV; dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); if (!dev) goto done; err = -EINVAL; if (mreq->mr_alen > dev->addr_len) goto done; err = -ENOBUFS; i = kmalloc(sizeof(*i), GFP_KERNEL); if (i == NULL) goto done; err = 0; for (ml = po->mclist; ml; ml = ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { ml->count++; /* Free the new element ... */ kfree(i); goto done; } } i->type = mreq->mr_type; i->ifindex = mreq->mr_ifindex; i->alen = mreq->mr_alen; memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); if (err) { po->mclist = i->next; kfree(i); } done: rtnl_unlock(); return err; } static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_mclist *ml, **mlp; rtnl_lock(); for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { if (--ml->count == 0) { struct net_device *dev; *mlp = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev) packet_dev_mc(dev, ml, -1); kfree(ml); } break; } } rtnl_unlock(); return 0; } static void packet_flush_mclist(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml; if (!po->mclist) return; rtnl_lock(); while ((ml = po->mclist) != NULL) { struct net_device *dev; po->mclist = ml->next; dev = __dev_get_by_index(sock_net(sk), ml->ifindex); if (dev != NULL) packet_dev_mc(dev, ml, -1); kfree(ml); } rtnl_unlock(); } static int packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); int ret; if (level != SOL_PACKET) return -ENOPROTOOPT; switch (optname) { case PACKET_ADD_MEMBERSHIP: case PACKET_DROP_MEMBERSHIP: { struct packet_mreq_max mreq; int len = optlen; memset(&mreq, 0, sizeof(mreq)); if (len < sizeof(struct packet_mreq)) return -EINVAL; if (len > sizeof(mreq)) len = sizeof(mreq); if (copy_from_sockptr(&mreq, optval, len)) return -EFAULT; if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address))) return -EINVAL; if (optname == PACKET_ADD_MEMBERSHIP) ret = packet_mc_add(sk, &mreq); else ret = packet_mc_drop(sk, &mreq); return ret; } case PACKET_RX_RING: case PACKET_TX_RING: { union tpacket_req_u req_u; ret = -EINVAL; lock_sock(sk); switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: if (optlen < sizeof(req_u.req)) break; ret = copy_from_sockptr(&req_u.req, optval, sizeof(req_u.req)) ? -EINVAL : 0; break; case TPACKET_V3: default: if (optlen < sizeof(req_u.req3)) break; ret = copy_from_sockptr(&req_u.req3, optval, sizeof(req_u.req3)) ? -EINVAL : 0; break; } if (!ret) ret = packet_set_ring(sk, &req_u, 0, optname == PACKET_TX_RING); release_sock(sk); return ret; } case PACKET_COPY_THRESH: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(pkt_sk(sk)->copy_thresh, val); return 0; } case PACKET_VERSION: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (val) { case TPACKET_V1: case TPACKET_V2: case TPACKET_V3: break; default: return -EINVAL; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_version = val; ret = 0; } release_sock(sk); return ret; } case PACKET_RESERVE: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val > INT_MAX) return -EINVAL; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { po->tp_reserve = val; ret = 0; } release_sock(sk); return ret; } case PACKET_LOSS: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val); ret = 0; } release_sock(sk); return ret; } case PACKET_AUXDATA: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val); return 0; } case PACKET_ORIGDEV: { int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val); return 0; } case PACKET_VNET_HDR: case PACKET_VNET_HDR_SZ: { int val, hdr_len; if (sock->type != SOCK_RAW) return -EINVAL; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (optname == PACKET_VNET_HDR_SZ) { if (val && val != sizeof(struct virtio_net_hdr) && val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) return -EINVAL; hdr_len = val; } else { hdr_len = val ? sizeof(struct virtio_net_hdr) : 0; } lock_sock(sk); if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { ret = -EBUSY; } else { WRITE_ONCE(po->vnet_hdr_sz, hdr_len); ret = 0; } release_sock(sk); return ret; } case PACKET_TIMESTAMP: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; WRITE_ONCE(po->tp_tstamp, val); return 0; } case PACKET_FANOUT: { struct fanout_args args = { 0 }; if (optlen != sizeof(int) && optlen != sizeof(args)) return -EINVAL; if (copy_from_sockptr(&args, optval, optlen)) return -EFAULT; return fanout_add(sk, &args); } case PACKET_FANOUT_DATA: { /* Paired with the WRITE_ONCE() in fanout_add() */ if (!READ_ONCE(po->fanout)) return -EINVAL; return fanout_set_data(po, optval, optlen); } case PACKET_IGNORE_OUTGOING: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; if (val < 0 || val > 1) return -EINVAL; WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val); return 0; } case PACKET_TX_HAS_OFF: { unsigned int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; lock_sock(sk); if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec) packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val); release_sock(sk); return 0; } case PACKET_QDISC_BYPASS: { int val; if (optlen != sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val); return 0; } default: return -ENOPROTOOPT; } } static int packet_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { int len; int val, lv = sizeof(val); struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); void *data = &val; union tpacket_stats_u st; struct tpacket_rollover_stats rstats; int drops; if (level != SOL_PACKET) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case PACKET_STATISTICS: spin_lock_bh(&sk->sk_receive_queue.lock); memcpy(&st, &po->stats, sizeof(st)); memset(&po->stats, 0, sizeof(po->stats)); spin_unlock_bh(&sk->sk_receive_queue.lock); drops = atomic_xchg(&po->tp_drops, 0); if (po->tp_version == TPACKET_V3) { lv = sizeof(struct tpacket_stats_v3); st.stats3.tp_drops = drops; st.stats3.tp_packets += drops; data = &st.stats3; } else { lv = sizeof(struct tpacket_stats); st.stats1.tp_drops = drops; st.stats1.tp_packets += drops; data = &st.stats1; } break; case PACKET_AUXDATA: val = packet_sock_flag(po, PACKET_SOCK_AUXDATA); break; case PACKET_ORIGDEV: val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV); break; case PACKET_VNET_HDR: val = !!READ_ONCE(po->vnet_hdr_sz); break; case PACKET_VNET_HDR_SZ: val = READ_ONCE(po->vnet_hdr_sz); break; case PACKET_COPY_THRESH: val = READ_ONCE(pkt_sk(sk)->copy_thresh); break; case PACKET_VERSION: val = po->tp_version; break; case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); if (len < sizeof(int)) return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { case TPACKET_V1: val = sizeof(struct tpacket_hdr); break; case TPACKET_V2: val = sizeof(struct tpacket2_hdr); break; case TPACKET_V3: val = sizeof(struct tpacket3_hdr); break; default: return -EINVAL; } break; case PACKET_RESERVE: val = po->tp_reserve; break; case PACKET_LOSS: val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS); break; case PACKET_TIMESTAMP: val = READ_ONCE(po->tp_tstamp); break; case PACKET_FANOUT: val = (po->fanout ? ((u32)po->fanout->id | ((u32)po->fanout->type << 16) | ((u32)po->fanout->flags << 24)) : 0); break; case PACKET_IGNORE_OUTGOING: val = READ_ONCE(po->prot_hook.ignore_outgoing); break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL; rstats.tp_all = atomic_long_read(&po->rollover->num); rstats.tp_huge = atomic_long_read(&po->rollover->num_huge); rstats.tp_failed = atomic_long_read(&po->rollover->num_failed); data = &rstats; lv = sizeof(rstats); break; case PACKET_TX_HAS_OFF: val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF); break; case PACKET_QDISC_BYPASS: val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS); break; default: return -ENOPROTOOPT; } if (len > lv) len = lv; if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, data, len)) return -EFAULT; return 0; } static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { struct packet_sock *po = pkt_sk(sk); switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) packet_dev_mclist_delete(dev, &po->mclist); fallthrough; case NETDEV_DOWN: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __unregister_prot_hook(sk, false); sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); WRITE_ONCE(po->ifindex, -1); netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); } break; case NETDEV_UP: if (dev->ifindex == po->ifindex) { spin_lock(&po->bind_lock); if (po->num) register_prot_hook(sk); spin_unlock(&po->bind_lock); } break; } } rcu_read_unlock(); return NOTIFY_DONE; } static int packet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCOUTQ: { int amount = sk_wmem_alloc_get(sk); return put_user(amount, (int __user *)arg); } case SIOCINQ: { struct sk_buff *skb; int amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) amount = skb->len; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } #ifdef CONFIG_INET case SIOCADDRT: case SIOCDELRT: case SIOCDARP: case SIOCGARP: case SIOCSARP: case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCSIFFLAGS: return inet_dgram_ops.ioctl(sock, cmd, arg); #endif default: return -ENOIOCTLCMD; } return 0; } static __poll_t packet_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); __poll_t mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); if (po->rx_ring.pg_vec) { if (!packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL)) mask |= EPOLLIN | EPOLLRDNORM; } packet_rcv_try_clear_pressure(po); spin_unlock_bh(&sk->sk_receive_queue.lock); spin_lock_bh(&sk->sk_write_queue.lock); if (po->tx_ring.pg_vec) { if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE)) mask |= EPOLLOUT | EPOLLWRNORM; } spin_unlock_bh(&sk->sk_write_queue.lock); return mask; } /* Dirty? Well, I still did not learn better way to account * for user mmaps. */ static void packet_mm_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_inc(&pkt_sk(sk)->mapped); } static void packet_mm_close(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct socket *sock = file->private_data; struct sock *sk = sock->sk; if (sk) atomic_long_dec(&pkt_sk(sk)->mapped); } static const struct vm_operations_struct packet_mmap_ops = { .open = packet_mm_open, .close = packet_mm_close, }; static void free_pg_vec(struct pgv *pg_vec, unsigned int order, unsigned int len) { int i; for (i = 0; i < len; i++) { if (likely(pg_vec[i].buffer)) { if (is_vmalloc_addr(pg_vec[i].buffer)) vfree(pg_vec[i].buffer); else free_pages((unsigned long)pg_vec[i].buffer, order); pg_vec[i].buffer = NULL; } } kfree(pg_vec); } static char *alloc_one_pg_vec_page(unsigned long order) { char *buffer; gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* __get_free_pages failed, fall back to vmalloc */ buffer = vzalloc(array_size((1 << order), PAGE_SIZE)); if (buffer) return buffer; /* vmalloc failed, lets dig into swap here */ gfp_flags &= ~__GFP_NORETRY; buffer = (char *) __get_free_pages(gfp_flags, order); if (buffer) return buffer; /* complete and utter failure */ return NULL; } static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order) { unsigned int block_nr = req->tp_block_nr; struct pgv *pg_vec; int i; pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN); if (unlikely(!pg_vec)) goto out; for (i = 0; i < block_nr; i++) { pg_vec[i].buffer = alloc_one_pg_vec_page(order); if (unlikely(!pg_vec[i].buffer)) goto out_free_pgvec; } out: return pg_vec; out_free_pgvec: free_pg_vec(pg_vec, order, block_nr); pg_vec = NULL; goto out; } static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring) { struct pgv *pg_vec = NULL; struct packet_sock *po = pkt_sk(sk); unsigned long *rx_owner_map = NULL; int was_running, order = 0; struct packet_ring_buffer *rb; struct sk_buff_head *rb_queue; __be16 num; int err; /* Added to avoid minimal code churn */ struct tpacket_req *req = &req_u->req; rb = tx_ring ? &po->tx_ring : &po->rx_ring; rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; err = -EBUSY; if (!closing) { if (atomic_long_read(&po->mapped)) goto out; if (packet_read_pending(rb)) goto out; } if (req->tp_block_nr) { unsigned int min_frame_size; /* Sanity tests and some calculations */ err = -EBUSY; if (unlikely(rb->pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V1: po->tp_hdrlen = TPACKET_HDRLEN; break; case TPACKET_V2: po->tp_hdrlen = TPACKET2_HDRLEN; break; case TPACKET_V3: po->tp_hdrlen = TPACKET3_HDRLEN; break; } err = -EINVAL; if (unlikely((int)req->tp_block_size <= 0)) goto out; if (unlikely(!PAGE_ALIGNED(req->tp_block_size))) goto out; min_frame_size = po->tp_hdrlen + po->tp_reserve; if (po->tp_version >= TPACKET_V3 && req->tp_block_size < BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size) goto out; if (unlikely(req->tp_frame_size < min_frame_size)) goto out; if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1))) goto out; rb->frames_per_block = req->tp_block_size / req->tp_frame_size; if (unlikely(rb->frames_per_block == 0)) goto out; if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr)) goto out; if (unlikely((rb->frames_per_block * req->tp_block_nr) != req->tp_frame_nr)) goto out; err = -ENOMEM; order = get_order(req->tp_block_size); pg_vec = alloc_pg_vec(req, order); if (unlikely(!pg_vec)) goto out; switch (po->tp_version) { case TPACKET_V3: /* Block transmit is not supported yet */ if (!tx_ring) { init_prb_bdqc(po, rb, pg_vec, req_u); } else { struct tpacket_req3 *req3 = &req_u->req3; if (req3->tp_retire_blk_tov || req3->tp_sizeof_priv || req3->tp_feature_req_word) { err = -EINVAL; goto out_free_pg_vec; } } break; default: if (!tx_ring) { rx_owner_map = bitmap_alloc(req->tp_frame_nr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); if (!rx_owner_map) goto out_free_pg_vec; } break; } } /* Done */ else { err = -EINVAL; if (unlikely(req->tp_frame_nr)) goto out; } /* Detach socket from network */ spin_lock(&po->bind_lock); was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING); num = po->num; if (was_running) { WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, false); } spin_unlock(&po->bind_lock); synchronize_net(); err = -EBUSY; mutex_lock(&po->pg_vec_lock); if (closing || atomic_long_read(&po->mapped) == 0) { err = 0; spin_lock_bh(&rb_queue->lock); swap(rb->pg_vec, pg_vec); if (po->tp_version <= TPACKET_V2) swap(rb->rx_owner_map, rx_owner_map); rb->frame_max = (req->tp_frame_nr - 1); rb->head = 0; rb->frame_size = req->tp_frame_size; spin_unlock_bh(&rb_queue->lock); swap(rb->pg_vec_order, order); swap(rb->pg_vec_len, req->tp_block_nr); rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE; po->prot_hook.func = (po->rx_ring.pg_vec) ? tpacket_rcv : packet_rcv; skb_queue_purge(rb_queue); if (atomic_long_read(&po->mapped)) pr_err("packet_mmap: vma is busy: %ld\n", atomic_long_read(&po->mapped)); } mutex_unlock(&po->pg_vec_lock); spin_lock(&po->bind_lock); if (was_running) { WRITE_ONCE(po->num, num); register_prot_hook(sk); } spin_unlock(&po->bind_lock); if (pg_vec && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) prb_shutdown_retire_blk_timer(po, rb_queue); } out_free_pg_vec: if (pg_vec) { bitmap_free(rx_owner_map); free_pg_vec(pg_vec, order, req->tp_block_nr); } out: return err; } static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); unsigned long size, expected_size; struct packet_ring_buffer *rb; unsigned long start; int err = -EINVAL; int i; if (vma->vm_pgoff) return -EINVAL; mutex_lock(&po->pg_vec_lock); expected_size = 0; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec) { expected_size += rb->pg_vec_len * rb->pg_vec_pages * PAGE_SIZE; } } if (expected_size == 0) goto out; size = vma->vm_end - vma->vm_start; if (size != expected_size) goto out; start = vma->vm_start; for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) { if (rb->pg_vec == NULL) continue; for (i = 0; i < rb->pg_vec_len; i++) { struct page *page; void *kaddr = rb->pg_vec[i].buffer; int pg_num; for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) { page = pgv_to_page(kaddr); err = vm_insert_page(vma, start, page); if (unlikely(err)) goto out; start += PAGE_SIZE; kaddr += PAGE_SIZE; } } } atomic_long_inc(&po->mapped); vma->vm_ops = &packet_mmap_ops; err = 0; out: mutex_unlock(&po->pg_vec_lock); return err; } static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind_spkt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname_spkt, .poll = datagram_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = packet_sendmsg_spkt, .recvmsg = packet_recvmsg, .mmap = sock_no_mmap, }; static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, .bind = packet_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = packet_getname, .poll = packet_poll, .ioctl = packet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, }; static const struct net_proto_family packet_family_ops = { .family = PF_PACKET, .create = packet_create, .owner = THIS_MODULE, }; static struct notifier_block packet_netdev_notifier = { .notifier_call = packet_notifier, }; #ifdef CONFIG_PROC_FS static void *packet_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); rcu_read_lock(); return seq_hlist_start_head_rcu(&net->packet.sklist, *pos); } static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct net *net = seq_file_net(seq); return seq_hlist_next_rcu(v, &net->packet.sklist, pos); } static void packet_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int packet_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%*sRefCnt Type Proto Iface R Rmem User Inode\n", IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk"); else { struct sock *s = sk_entry(v); const struct packet_sock *po = pkt_sk(s); seq_printf(seq, "%pK %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), s->sk_type, ntohs(READ_ONCE(po->num)), READ_ONCE(po->ifindex), packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), sock_i_ino(s)); } return 0; } static const struct seq_operations packet_seq_ops = { .start = packet_seq_start, .next = packet_seq_next, .stop = packet_seq_stop, .show = packet_seq_show, }; #endif static int __net_init packet_net_init(struct net *net) { mutex_init(&net->packet.sklist_lock); INIT_HLIST_HEAD(&net->packet.sklist); #ifdef CONFIG_PROC_FS if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } static void __net_exit packet_net_exit(struct net *net) { remove_proc_entry("packet", net->proc_net); WARN_ON_ONCE(!hlist_empty(&net->packet.sklist)); } static struct pernet_operations packet_net_ops = { .init = packet_net_init, .exit = packet_net_exit, }; static void __exit packet_exit(void) { sock_unregister(PF_PACKET); proto_unregister(&packet_proto); unregister_netdevice_notifier(&packet_netdev_notifier); unregister_pernet_subsys(&packet_net_ops); } static int __init packet_init(void) { int rc; rc = register_pernet_subsys(&packet_net_ops); if (rc) goto out; rc = register_netdevice_notifier(&packet_netdev_notifier); if (rc) goto out_pernet; rc = proto_register(&packet_proto, 0); if (rc) goto out_notifier; rc = sock_register(&packet_family_ops); if (rc) goto out_proto; return 0; out_proto: proto_unregister(&packet_proto); out_notifier: unregister_netdevice_notifier(&packet_netdev_notifier); out_pernet: unregister_pernet_subsys(&packet_net_ops); out: return rc; } module_init(packet_init); module_exit(packet_exit); MODULE_DESCRIPTION("Packet socket support (AF_PACKET)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_PACKET);
20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_BUFFER_H #define _LINUX_TTY_BUFFER_H #include <linux/atomic.h> #include <linux/llist.h> #include <linux/mutex.h> #include <linux/workqueue.h> struct tty_buffer { union { struct tty_buffer *next; struct llist_node free; }; unsigned int used; unsigned int size; unsigned int commit; unsigned int lookahead; /* Lazy update on recv, can become less than "read" */ unsigned int read; bool flags; /* Data points here */ u8 data[] __aligned(sizeof(unsigned long)); }; static inline u8 *char_buf_ptr(struct tty_buffer *b, unsigned int ofs) { return b->data + ofs; } static inline u8 *flag_buf_ptr(struct tty_buffer *b, unsigned int ofs) { return char_buf_ptr(b, ofs) + b->size; } struct tty_bufhead { struct tty_buffer *head; /* Queue head */ struct work_struct work; struct mutex lock; atomic_t priority; struct tty_buffer sentinel; struct llist_head free; /* Free queue head */ atomic_t mem_used; /* In-use buffers excluding free list */ int mem_limit; struct tty_buffer *tail; /* Active buffer */ }; /* * When a break, frame error, or parity error happens, these codes are * stuffed into the flags buffer. */ #define TTY_NORMAL 0 #define TTY_BREAK 1 #define TTY_FRAME 2 #define TTY_PARITY 3 #define TTY_OVERRUN 4 #endif
153 3 151 153 151 194 92 1 10 157 2 3 2 156 153 154 2 3 3 1 2 1 1 152 11 111 55 35 116 2 3 144 88 77 15 3 77 68 5 5 62 75 81 64 71 13 3 5 60 1 61 14 53 77 14 3 2 1 4 3 3 3 1 1 3 4 23 24 1 14 12 2 7 2 5 63 63 12 12 76 19 59 113 5 35 2 79 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/fs.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" // clang-format off #define SYSTEM_DOS_ATTRIB "system.dos_attrib" #define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib" #define SYSTEM_NTFS_ATTRIB_BE "system.ntfs_attrib_be" #define SYSTEM_NTFS_SECURITY "system.ntfs_security" // clang-format on static inline size_t unpacked_ea_size(const struct EA_FULL *ea) { return ea->size ? le32_to_cpu(ea->size) : ALIGN(struct_size(ea, name, 1 + ea->name_len + le16_to_cpu(ea->elength)), 4); } static inline size_t packed_ea_size(const struct EA_FULL *ea) { return struct_size(ea, name, 1 + ea->name_len + le16_to_cpu(ea->elength)) - offsetof(struct EA_FULL, flags); } /* * find_ea * * Assume there is at least one xattr in the list. */ static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes, const char *name, u8 name_len, u32 *off, u32 *ea_sz) { u32 ea_size; *off = 0; if (!ea_all) return false; for (; *off < bytes; *off += ea_size) { const struct EA_FULL *ea = Add2Ptr(ea_all, *off); ea_size = unpacked_ea_size(ea); if (ea->name_len == name_len && !memcmp(ea->name, name, name_len)) { if (ea_sz) *ea_sz = ea_size; return true; } } return false; } /* * ntfs_read_ea - Read all extended attributes. * @ea: New allocated memory. * @info: Pointer into resident data. */ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, size_t add_bytes, const struct EA_INFO **info) { int err = -EINVAL; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *attr_info, *attr_ea; void *ea_p; u32 size, off, ea_size; static_assert(le32_to_cpu(ATTR_EA_INFO) < le32_to_cpu(ATTR_EA)); *ea = NULL; *info = NULL; attr_info = ni_find_attr(ni, NULL, &le, ATTR_EA_INFO, NULL, 0, NULL, NULL); attr_ea = ni_find_attr(ni, attr_info, &le, ATTR_EA, NULL, 0, NULL, NULL); if (!attr_ea || !attr_info) return 0; *info = resident_data_ex(attr_info, sizeof(struct EA_INFO)); if (!*info) goto out; /* Check Ea limit. */ size = le32_to_cpu((*info)->size); if (size > sbi->ea_max_size) { err = -EFBIG; goto out; } if (attr_size(attr_ea) > sbi->ea_max_size) { err = -EFBIG; goto out; } if (!size) { /* EA info persists, but xattr is empty. Looks like EA problem. */ goto out; } /* Allocate memory for packed Ea. */ ea_p = kmalloc(size_add(size, add_bytes), GFP_NOFS); if (!ea_p) return -ENOMEM; if (attr_ea->non_res) { struct runs_tree run; run_init(&run); err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &run, 0, size); if (!err) err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL); run_close(&run); if (err) goto out1; } else { void *p = resident_data_ex(attr_ea, size); if (!p) goto out1; memcpy(ea_p, p, size); } memset(Add2Ptr(ea_p, size), 0, add_bytes); err = -EINVAL; /* Check all attributes for consistency. */ for (off = 0; off < size; off += ea_size) { const struct EA_FULL *ef = Add2Ptr(ea_p, off); u32 bytes = size - off; /* Check if we can use field ea->size. */ if (bytes < sizeof(ef->size)) goto out1; if (ef->size) { ea_size = le32_to_cpu(ef->size); if (ea_size > bytes) goto out1; continue; } /* Check if we can use fields ef->name_len and ef->elength. */ if (bytes < offsetof(struct EA_FULL, name)) goto out1; ea_size = ALIGN(struct_size(ef, name, 1 + ef->name_len + le16_to_cpu(ef->elength)), 4); if (ea_size > bytes) goto out1; } *ea = ea_p; return 0; out1: kfree(ea_p); out: ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); return err; } /* * ntfs_list_ea * * Copy a list of xattrs names into the buffer * provided, or compute the buffer size required. * * Return: * * Number of bytes used / required on * * -ERRNO - on failure */ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, size_t bytes_per_buffer) { const struct EA_INFO *info; struct EA_FULL *ea_all = NULL; u32 off, size; int err; size_t ret; err = ntfs_read_ea(ni, &ea_all, 0, &info); if (err) return err; if (!info || !ea_all) return 0; size = le32_to_cpu(info->size); /* Enumerate all xattrs. */ ret = 0; off = 0; while (off + sizeof(struct EA_FULL) < size) { const struct EA_FULL *ea = Add2Ptr(ea_all, off); int ea_size = unpacked_ea_size(ea); u8 name_len = ea->name_len; if (!name_len) break; if (name_len > ea_size) { ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); err = -EINVAL; /* corrupted fs. */ break; } if (buffer) { /* Check if we can use field ea->name */ if (off + ea_size > size) break; if (ret + name_len + 1 > bytes_per_buffer) { err = -ERANGE; goto out; } memcpy(buffer + ret, ea->name, name_len); buffer[ret + name_len] = 0; } ret += name_len + 1; off += ea_size; } out: kfree(ea_all); return err ? err : ret; } static int ntfs_get_ea(struct inode *inode, const char *name, size_t name_len, void *buffer, size_t size, size_t *required) { struct ntfs_inode *ni = ntfs_i(inode); const struct EA_INFO *info; struct EA_FULL *ea_all = NULL; const struct EA_FULL *ea; u32 off, len; int err; if (!(ni->ni_flags & NI_FLAG_EA)) return -ENODATA; if (!required) ni_lock(ni); len = 0; if (name_len > 255) { err = -ENAMETOOLONG; goto out; } err = ntfs_read_ea(ni, &ea_all, 0, &info); if (err) goto out; if (!info) goto out; /* Enumerate all xattrs. */ if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off, NULL)) { err = -ENODATA; goto out; } ea = Add2Ptr(ea_all, off); len = le16_to_cpu(ea->elength); if (!buffer) { err = 0; goto out; } if (len > size) { err = -ERANGE; if (required) *required = len; goto out; } memcpy(buffer, ea->name + ea->name_len + 1, len); err = 0; out: kfree(ea_all); if (!required) ni_unlock(ni); return err ? err : len; } static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, const void *value, size_t val_size, int flags, bool locked, __le16 *ea_size) { struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = ni->mi.sbi; int err; struct EA_INFO ea_info; const struct EA_INFO *info; struct EA_FULL *new_ea; struct EA_FULL *ea_all = NULL; size_t add, new_pack; u32 off, size, ea_sz; __le16 size_pack; struct ATTRIB *attr; struct ATTR_LIST_ENTRY *le; struct mft_inode *mi; struct runs_tree ea_run; u64 new_sz; void *p; if (!locked) ni_lock(ni); run_init(&ea_run); if (name_len > 255) { err = -ENAMETOOLONG; goto out; } add = ALIGN(struct_size(ea_all, name, 1 + name_len + val_size), 4); err = ntfs_read_ea(ni, &ea_all, add, &info); if (err) goto out; if (!info) { memset(&ea_info, 0, sizeof(ea_info)); size = 0; size_pack = 0; } else { memcpy(&ea_info, info, sizeof(ea_info)); size = le32_to_cpu(ea_info.size); size_pack = ea_info.size_pack; } if (info && find_ea(ea_all, size, name, name_len, &off, &ea_sz)) { struct EA_FULL *ea; if (flags & XATTR_CREATE) { err = -EEXIST; goto out; } ea = Add2Ptr(ea_all, off); /* * Check simple case when we try to insert xattr with the same value * e.g. ntfs_save_wsl_perm */ if (val_size && le16_to_cpu(ea->elength) == val_size && !memcmp(ea->name + ea->name_len + 1, value, val_size)) { /* xattr already contains the required value. */ goto out; } /* Remove current xattr. */ if (ea->flags & FILE_NEED_EA) le16_add_cpu(&ea_info.count, -1); le16_add_cpu(&ea_info.size_pack, 0 - packed_ea_size(ea)); memmove(ea, Add2Ptr(ea, ea_sz), size - off - ea_sz); size -= ea_sz; memset(Add2Ptr(ea_all, size), 0, ea_sz); ea_info.size = cpu_to_le32(size); if ((flags & XATTR_REPLACE) && !val_size) { /* Remove xattr. */ goto update_ea; } } else { if (flags & XATTR_REPLACE) { err = -ENODATA; goto out; } if (!ea_all) { ea_all = kzalloc(add, GFP_NOFS); if (!ea_all) { err = -ENOMEM; goto out; } } } /* Append new xattr. */ new_ea = Add2Ptr(ea_all, size); new_ea->size = cpu_to_le32(add); new_ea->flags = 0; new_ea->name_len = name_len; new_ea->elength = cpu_to_le16(val_size); memcpy(new_ea->name, name, name_len); new_ea->name[name_len] = 0; memcpy(new_ea->name + name_len + 1, value, val_size); new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea); ea_info.size_pack = cpu_to_le16(new_pack); /* New size of ATTR_EA. */ size += add; ea_info.size = cpu_to_le32(size); /* * 1. Check ea_info.size_pack for overflow. * 2. New attribute size must fit value from $AttrDef */ if (new_pack > 0xffff || size > sbi->ea_max_size) { ntfs_inode_warn( inode, "The size of extended attributes must not exceed 64KiB"); err = -EFBIG; // -EINVAL? goto out; } update_ea: if (!info) { /* Create xattr. */ if (!size) { err = 0; goto out; } err = ni_insert_resident(ni, sizeof(struct EA_INFO), ATTR_EA_INFO, NULL, 0, NULL, NULL, NULL); if (err) goto out; err = ni_insert_resident(ni, 0, ATTR_EA, NULL, 0, NULL, NULL, NULL); if (err) goto out; } new_sz = size; err = attr_set_size(ni, ATTR_EA, NULL, 0, &ea_run, new_sz, &new_sz, false, NULL); if (err) goto out; le = NULL; attr = ni_find_attr(ni, NULL, &le, ATTR_EA_INFO, NULL, 0, NULL, &mi); if (!attr) { err = -EINVAL; goto out; } if (!size) { /* Delete xattr, ATTR_EA_INFO */ ni_remove_attr_le(ni, attr, mi, le); } else { p = resident_data_ex(attr, sizeof(struct EA_INFO)); if (!p) { err = -EINVAL; goto out; } memcpy(p, &ea_info, sizeof(struct EA_INFO)); mi->dirty = true; } le = NULL; attr = ni_find_attr(ni, NULL, &le, ATTR_EA, NULL, 0, NULL, &mi); if (!attr) { err = -EINVAL; goto out; } if (!size) { /* Delete xattr, ATTR_EA */ ni_remove_attr_le(ni, attr, mi, le); } else if (attr->non_res) { err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &ea_run, 0, size); if (err) goto out; err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0); if (err) goto out; } else { p = resident_data_ex(attr, size); if (!p) { err = -EINVAL; goto out; } memcpy(p, ea_all, size); mi->dirty = true; } /* Check if we delete the last xattr. */ if (size) ni->ni_flags |= NI_FLAG_EA; else ni->ni_flags &= ~NI_FLAG_EA; if (ea_info.size_pack != size_pack) ni->ni_flags |= NI_FLAG_UPDATE_PARENT; if (ea_size) *ea_size = ea_info.size_pack; mark_inode_dirty(&ni->vfs_inode); out: if (!locked) ni_unlock(ni); run_close(&ea_run); kfree(ea_all); return err; } #ifdef CONFIG_NTFS3_FS_POSIX_ACL /* * ntfs_get_acl - inode_operations::get_acl */ struct posix_acl *ntfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { struct inode *inode = d_inode(dentry); struct ntfs_inode *ni = ntfs_i(inode); const char *name; size_t name_len; struct posix_acl *acl; size_t req; int err; void *buf; /* Allocate PATH_MAX bytes. */ buf = __getname(); if (!buf) return ERR_PTR(-ENOMEM); /* Possible values of 'type' was already checked above. */ if (type == ACL_TYPE_ACCESS) { name = XATTR_NAME_POSIX_ACL_ACCESS; name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; } else { name = XATTR_NAME_POSIX_ACL_DEFAULT; name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1; } ni_lock(ni); err = ntfs_get_ea(inode, name, name_len, buf, PATH_MAX, &req); ni_unlock(ni); /* Translate extended attribute to acl. */ if (err >= 0) { acl = posix_acl_from_xattr(&init_user_ns, buf, err); } else if (err == -ENODATA) { acl = NULL; } else { acl = ERR_PTR(err); } if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); __putname(buf); return acl; } static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, struct inode *inode, struct posix_acl *acl, int type, bool init_acl) { const char *name; size_t size, name_len; void *value; int err; int flags; umode_t mode; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: /* Do not change i_mode if we are in init_acl */ if (acl && !init_acl) { err = posix_acl_update_mode(idmap, inode, &mode, &acl); if (err) return err; } name = XATTR_NAME_POSIX_ACL_ACCESS; name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; name = XATTR_NAME_POSIX_ACL_DEFAULT; name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1; break; default: return -EINVAL; } if (!acl) { /* Remove xattr if it can be presented via mode. */ size = 0; value = NULL; flags = XATTR_REPLACE; } else { size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_NOFS); if (!value) return -ENOMEM; err = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (err < 0) goto out; flags = 0; } err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0, NULL); if (err == -ENODATA && !size) err = 0; /* Removing non existed xattr. */ if (!err) { set_cached_acl(inode, type, acl); inode->i_mode = mode; inode_set_ctime_current(inode); mark_inode_dirty(inode); } out: kfree(value); return err; } /* * ntfs_set_acl - inode_operations::set_acl */ int ntfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { return ntfs_set_acl_ex(idmap, d_inode(dentry), acl, type, false); } /* * ntfs_init_acl - Initialize the ACLs of a new inode. * * Called from ntfs_create_inode(). */ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; int err; err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (err) return err; if (default_acl) { err = ntfs_set_acl_ex(idmap, inode, default_acl, ACL_TYPE_DEFAULT, true); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; } if (acl) { if (!err) err = ntfs_set_acl_ex(idmap, inode, acl, ACL_TYPE_ACCESS, true); posix_acl_release(acl); } else { inode->i_acl = NULL; } return err; } #endif /* * ntfs_acl_chmod - Helper for ntfs_setattr(). */ int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; if (!(sb->s_flags & SB_POSIXACL)) return 0; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; return posix_acl_chmod(idmap, dentry, inode->i_mode); } /* * ntfs_listxattr - inode_operations::listxattr */ ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct inode *inode = d_inode(dentry); struct ntfs_inode *ni = ntfs_i(inode); ssize_t ret; if (!(ni->ni_flags & NI_FLAG_EA)) { /* no xattr in file */ return 0; } ni_lock(ni); ret = ntfs_list_ea(ni, buffer, size); ni_unlock(ni); return ret; } static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, struct inode *inode, const char *name, void *buffer, size_t size) { int err; struct ntfs_inode *ni = ntfs_i(inode); if (unlikely(ntfs3_forced_shutdown(inode->i_sb))) return -EIO; /* Dispatch request. */ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { /* system.dos_attrib */ if (!buffer) { err = sizeof(u8); } else if (size < sizeof(u8)) { err = -ENODATA; } else { err = sizeof(u8); *(u8 *)buffer = le32_to_cpu(ni->std_fa); } goto out; } if (!strcmp(name, SYSTEM_NTFS_ATTRIB) || !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) { /* system.ntfs_attrib */ if (!buffer) { err = sizeof(u32); } else if (size < sizeof(u32)) { err = -ENODATA; } else { err = sizeof(u32); *(u32 *)buffer = le32_to_cpu(ni->std_fa); if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) *(__be32 *)buffer = cpu_to_be32(*(u32 *)buffer); } goto out; } if (!strcmp(name, SYSTEM_NTFS_SECURITY)) { /* system.ntfs_security*/ struct SECURITY_DESCRIPTOR_RELATIVE *sd = NULL; size_t sd_size = 0; if (!is_ntfs3(ni->mi.sbi)) { /* We should get nt4 security. */ err = -EINVAL; goto out; } else if (le32_to_cpu(ni->std_security_id) < SECURITY_ID_FIRST) { err = -ENOENT; goto out; } err = ntfs_get_security_by_id(ni->mi.sbi, ni->std_security_id, &sd, &sd_size); if (err) goto out; if (!is_sd_valid(sd, sd_size)) { ntfs_inode_warn( inode, "looks like you get incorrect security descriptor id=%u", ni->std_security_id); } if (!buffer) { err = sd_size; } else if (size < sd_size) { err = -ENODATA; } else { err = sd_size; memcpy(buffer, sd, sd_size); } kfree(sd); goto out; } /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, strlen(name), buffer, size, NULL); out: return err; } /* * ntfs_setxattr - inode_operations::setxattr */ static noinline int ntfs_setxattr(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *de, struct inode *inode, const char *name, const void *value, size_t size, int flags) { int err = -EINVAL; struct ntfs_inode *ni = ntfs_i(inode); enum FILE_ATTRIBUTE new_fa; /* Dispatch request. */ if (!strcmp(name, SYSTEM_DOS_ATTRIB)) { if (sizeof(u8) != size) goto out; new_fa = cpu_to_le32(*(u8 *)value); goto set_new_fa; } if (!strcmp(name, SYSTEM_NTFS_ATTRIB) || !strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) { if (size != sizeof(u32)) goto out; if (!strcmp(name, SYSTEM_NTFS_ATTRIB_BE)) new_fa = cpu_to_le32(be32_to_cpu(*(__be32 *)value)); else new_fa = cpu_to_le32(*(u32 *)value); if (S_ISREG(inode->i_mode)) { /* Process compressed/sparsed in special way. */ ni_lock(ni); err = ni_new_attr_flags(ni, new_fa); ni_unlock(ni); if (err) goto out; } set_new_fa: /* * Thanks Mark Harmstone: * Keep directory bit consistency. */ if (S_ISDIR(inode->i_mode)) new_fa |= FILE_ATTRIBUTE_DIRECTORY; else new_fa &= ~FILE_ATTRIBUTE_DIRECTORY; if (ni->std_fa != new_fa) { ni->std_fa = new_fa; if (new_fa & FILE_ATTRIBUTE_READONLY) inode->i_mode &= ~0222; else inode->i_mode |= 0222; /* Std attribute always in primary record. */ ni->mi.dirty = true; mark_inode_dirty(inode); } err = 0; goto out; } if (!strcmp(name, SYSTEM_NTFS_SECURITY)) { /* system.ntfs_security*/ __le32 security_id; bool inserted; struct ATTR_STD_INFO5 *std; if (!is_ntfs3(ni->mi.sbi)) { /* * We should replace ATTR_SECURE. * Skip this way cause it is nt4 feature. */ err = -EINVAL; goto out; } if (!is_sd_valid(value, size)) { err = -EINVAL; ntfs_inode_warn( inode, "you try to set invalid security descriptor"); goto out; } err = ntfs_insert_security(ni->mi.sbi, value, size, &security_id, &inserted); if (err) goto out; ni_lock(ni); std = ni_std5(ni); if (!std) { err = -EINVAL; } else if (std->security_id != security_id) { std->security_id = ni->std_security_id = security_id; /* Std attribute always in primary record. */ ni->mi.dirty = true; mark_inode_dirty(&ni->vfs_inode); } ni_unlock(ni); goto out; } /* Deal with NTFS extended attribute. */ err = ntfs_set_ea(inode, name, strlen(name), value, size, flags, 0, NULL); out: inode_set_ctime_current(inode); mark_inode_dirty(inode); return err; } /* * ntfs_save_wsl_perm * * save uid/gid/mode in xattr */ int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size) { int err; __le32 value; struct ntfs_inode *ni = ntfs_i(inode); ni_lock(ni); value = cpu_to_le32(i_uid_read(inode)); err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, sizeof(value), 0, true, ea_size); if (err) goto out; value = cpu_to_le32(i_gid_read(inode)); err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, sizeof(value), 0, true, ea_size); if (err) goto out; value = cpu_to_le32(inode->i_mode); err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, sizeof(value), 0, true, ea_size); if (err) goto out; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { value = cpu_to_le32(inode->i_rdev); err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, sizeof(value), 0, true, ea_size); if (err) goto out; } out: ni_unlock(ni); /* In case of error should we delete all WSL xattr? */ return err; } /* * ntfs_get_wsl_perm * * get uid/gid/mode from xattr * it is called from ntfs_iget5->ntfs_read_mft */ void ntfs_get_wsl_perm(struct inode *inode) { size_t sz; __le32 value[3]; if (ntfs_get_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value[0], sizeof(value[0]), &sz) == sizeof(value[0]) && ntfs_get_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value[1], sizeof(value[1]), &sz) == sizeof(value[1]) && ntfs_get_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value[2], sizeof(value[2]), &sz) == sizeof(value[2])) { i_uid_write(inode, (uid_t)le32_to_cpu(value[0])); i_gid_write(inode, (gid_t)le32_to_cpu(value[1])); inode->i_mode = le32_to_cpu(value[2]); if (ntfs_get_ea(inode, "$LXDEV", sizeof("$$LXDEV") - 1, &value[0], sizeof(value), &sz) == sizeof(value[0])) { inode->i_rdev = le32_to_cpu(value[0]); } } } static bool ntfs_xattr_user_list(struct dentry *dentry) { return true; } // clang-format off static const struct xattr_handler ntfs_other_xattr_handler = { .prefix = "", .get = ntfs_getxattr, .set = ntfs_setxattr, .list = ntfs_xattr_user_list, }; const struct xattr_handler * const ntfs_xattr_handlers[] = { &ntfs_other_xattr_handler, NULL, }; // clang-format on
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 // SPDX-License-Identifier: GPL-2.0 /* dvb-usb-firmware.c is part of the DVB USB library. * * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@posteo.de) * see dvb-usb-init.c for copyright information. * * This file contains functions for downloading the firmware to Cypress FX 1 and 2 based devices. * * FIXME: This part does actually not belong to dvb-usb, but to the usb-subsystem. */ #include "dvb-usb-common.h" #include <linux/usb.h> struct usb_cypress_controller { int id; const char *name; /* name of the usb controller */ u16 cpu_cs_register; /* needs to be restarted, when the firmware has been downloaded. */ }; static struct usb_cypress_controller cypress[] = { { .id = DEVICE_SPECIFIC, .name = "Device specific", .cpu_cs_register = 0 }, { .id = CYPRESS_AN2135, .name = "Cypress AN2135", .cpu_cs_register = 0x7f92 }, { .id = CYPRESS_AN2235, .name = "Cypress AN2235", .cpu_cs_register = 0x7f92 }, { .id = CYPRESS_FX2, .name = "Cypress FX2", .cpu_cs_register = 0xe600 }, }; /* * load a firmware packet to the device */ static int usb_cypress_writemem(struct usb_device *udev,u16 addr,u8 *data, u8 len) { return usb_control_msg(udev, usb_sndctrlpipe(udev,0), 0xa0, USB_TYPE_VENDOR, addr, 0x00, data, len, 5000); } int usb_cypress_load_firmware(struct usb_device *udev, const struct firmware *fw, int type) { struct hexline *hx; u8 *buf; int ret, pos = 0; u16 cpu_cs_register = cypress[type].cpu_cs_register; buf = kmalloc(sizeof(*hx), GFP_KERNEL); if (!buf) return -ENOMEM; hx = (struct hexline *)buf; /* stop the CPU */ buf[0] = 1; if (usb_cypress_writemem(udev, cpu_cs_register, buf, 1) != 1) err("could not stop the USB controller CPU."); while ((ret = dvb_usb_get_hexline(fw, hx, &pos)) > 0) { deb_fw("writing to address 0x%04x (buffer: 0x%02x %02x)\n", hx->addr, hx->len, hx->chk); ret = usb_cypress_writemem(udev, hx->addr, hx->data, hx->len); if (ret != hx->len) { err("error while transferring firmware (transferred size: %d, block size: %d)", ret, hx->len); ret = -EINVAL; break; } } if (ret < 0) { err("firmware download failed at %d with %d",pos,ret); kfree(buf); return ret; } if (ret == 0) { /* restart the CPU */ buf[0] = 0; if (usb_cypress_writemem(udev, cpu_cs_register, buf, 1) != 1) { err("could not restart the USB controller CPU."); ret = -EINVAL; } } else ret = -EIO; kfree(buf); return ret; } EXPORT_SYMBOL(usb_cypress_load_firmware); int dvb_usb_download_firmware(struct usb_device *udev, const struct dvb_usb_device_properties *props) { int ret; const struct firmware *fw = NULL; if ((ret = request_firmware(&fw, props->firmware, &udev->dev)) != 0) { err("did not find the firmware file '%s' (status %d). You can use <kernel_dir>/scripts/get_dvb_firmware to get the firmware", props->firmware,ret); return ret; } info("downloading firmware from file '%s'",props->firmware); switch (props->usb_ctrl) { case CYPRESS_AN2135: case CYPRESS_AN2235: case CYPRESS_FX2: ret = usb_cypress_load_firmware(udev, fw, props->usb_ctrl); break; case DEVICE_SPECIFIC: if (props->download_firmware) ret = props->download_firmware(udev,fw); else { err("BUG: driver didn't specified a download_firmware-callback, although it claims to have a DEVICE_SPECIFIC one."); ret = -EINVAL; } break; default: ret = -EINVAL; break; } release_firmware(fw); return ret; } int dvb_usb_get_hexline(const struct firmware *fw, struct hexline *hx, int *pos) { u8 *b = (u8 *) &fw->data[*pos]; int data_offs = 4; if (*pos >= fw->size) return 0; memset(hx,0,sizeof(struct hexline)); hx->len = b[0]; if ((*pos + hx->len + 4) >= fw->size) return -EINVAL; hx->addr = b[1] | (b[2] << 8); hx->type = b[3]; if (hx->type == 0x04) { /* b[4] and b[5] are the Extended linear address record data field */ hx->addr |= (b[4] << 24) | (b[5] << 16); /* hx->len -= 2; data_offs += 2; */ } memcpy(hx->data,&b[data_offs],hx->len); hx->chk = b[hx->len + data_offs]; *pos += hx->len + 5; return *pos; } EXPORT_SYMBOL(dvb_usb_get_hexline);
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ /* 7 skbinfo support added */ #define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); /* Type specific function prefix */ #define HTYPE hash_ipportnet /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr - 1, * dancing back and forth. */ #define IP_SET_HASH_WITH_NETS_PACKED #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS /* IPv4 variant */ /* Member elements */ struct hash_ipportnet4_elem { __be32 ip; __be32 ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet4_data_list(struct sk_buff *skb, const struct hash_ipportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { next->ip = d->ip; next->port = d->port; next->ip2 = d->ip2; } #define MTYPE hash_ipportnet4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); e.ip2 &= ip_set_netmask(e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { e.ip = htonl(ip); e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); if (ret) return ret; if (ip2_from > ip2_to) swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); } if (retried) { ip = ntohl(h->next.ip); p = ntohs(h->next.port); ip2 = ntohl(h->next.ip2); } else { p = port; ip2 = ip2_from; } for (; ip <= ip_to; ip++) { e.ip = htonl(ip); for (; p <= port_to; p++) { e.port = htons(p); do { i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; if (i > IPSET_MAX_RANGE) { hash_ipportnet4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip2++ < ip2_to); ip2 = ip2_from; } p = port; } return ret; } /* IPv6 variant */ struct hash_ipportnet6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet6_data_list(struct sk_buff *skb, const struct hash_ipportnet6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportnet6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); ip6_netmask(&e.ip2, e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } ip6_netmask(&e.ip2, e.cidr + 1); e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportnet_type __read_mostly = { .name = "hash:ip,port,net", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportnet_init(void) { return ip_set_type_register(&hash_ipportnet_type); } static void __exit hash_ipportnet_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportnet_type); } module_init(hash_ipportnet_init); module_exit(hash_ipportnet_fini);
248 250 250 252 250 249 250 251 252 306 106 105 1 106 90 1 89 271 1 18 127 177 20 263 263 258 5 162 145 258 3 1 259 4 264 227 31 12 348 2 72 276 338 14 208 57 8 187 143 139 4 138 73 64 135 2 134 135 7 128 74 57 66 64 135 179 3 12 168 167 177 183 397 25 3 368 8 1 32 6 3 378 125 313 318 34 131 75 92 262 9 140 81 80 1 262 15 15 15 14 1 14 24 13 14 32 2 1 1 14 19 29 1432 1434 1203 267 1066 1069 6 1 5 188 29 161 143 71 142 159 1 131 40 63 46 80 27 156 127 2 13 1 62 2 30 1 17 203 18 159 27 195 194 9 9 216 215 534 533 42 42 42 36 5 2 35 4 38 3 38 8 38 3 4 6 29 7 7 21 29 4 29 29 4 4 36 35 6 3 7 2 18 18 18 18 2 2 35 12 5 7 45 45 1 1 45 45 45 45 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PF_INET protocol family socket handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Changes (see also sock.c) * * piggy, * Karl Knutson : Socket protocol table * A.N.Kuznetsov : Socket death error in accept(). * John Richardson : Fix non blocking error in connect() * so sockets that fail to connect * don't return -EINPROGRESS. * Alan Cox : Asynchronous I/O support * Alan Cox : Keep correct socket pointer on sock * structures * when accept() ed * Alan Cox : Semantics of SO_LINGER aren't state * moved to close when you look carefully. * With this fixed and the accept bug fixed * some RPC stuff seems happier. * Niibe Yutaka : 4.4BSD style write async I/O * Alan Cox, * Tony Gale : Fixed reuse semantics. * Alan Cox : bind() shouldn't abort existing but dead * sockets. Stops FTP netin:.. I hope. * Alan Cox : bind() works correctly for RAW sockets. * Note that FreeBSD at least was broken * in this respect so be careful with * compatibility tests... * Alan Cox : routing cache support * Alan Cox : memzero the socket structure for * compactness. * Matt Day : nonblock connect error handler * Alan Cox : Allow large numbers of pending sockets * (eg for big web sites), but only if * specifically application requested. * Alan Cox : New buffering throughout IP. Used * dumbly. * Alan Cox : New buffering now used smartly. * Alan Cox : BSD rather than common sense * interpretation of listen. * Germano Caronni : Assorted small races. * Alan Cox : sendmsg/recvmsg basic support. * Alan Cox : Only sendmsg/recvmsg now supported. * Alan Cox : Locked down bind (see security list). * Alan Cox : Loosened bind a little. * Mike McLagan : ADD/DEL DLCI Ioctls * Willy Konynenberg : Transparent proxying support. * David S. Miller : New socket lookup architecture. * Some other random speedups. * Cyrus Durgin : Cleaned up file for kmod hacks. * Andi Kleen : Fix inet_stream_connect TCP race. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/err.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/sched.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/checksum.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/inet_connection_sock.h> #include <net/gro.h> #include <net/gso.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/raw.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/ip_tunnels.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/secure_seq.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif #include <net/l3mdev.h> #include <net/compat.h> #include <net/rps.h> #include <trace/events/sock.h> /* The inetsw table contains everything that inet_create needs to * build a new socket. */ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); /* New destruction routine */ void inet_sock_destruct(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_error_queue); sk_mem_reclaim_final(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { pr_err("Attempt to release TCP socket in state %d %p\n", sk->sk_state, sk); return; } if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive inet socket %p\n", sk); return; } WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); WARN_ON_ONCE(sk->sk_wmem_queued); WARN_ON_ONCE(sk_forward_alloc_get(sk)); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); } EXPORT_SYMBOL(inet_sock_destruct); /* * The routines beyond this point handle the behaviour of an AF_INET * socket object. Mostly it punts to the subprotocols of IP to do * the work. */ /* * Automatically bind an unbound socket. */ static int inet_autobind(struct sock *sk) { struct inet_sock *inet; /* We may need to bind the socket. */ lock_sock(sk); inet = inet_sk(sk); if (!inet->inet_num) { if (sk->sk_prot->get_port(sk, 0)) { release_sock(sk); return -EAGAIN; } inet->inet_sport = htons(inet->inet_num); } release_sock(sk); return 0; } int __inet_listen_sk(struct sock *sk, int backlog) { unsigned char old_state = sk->sk_state; int err, tcp_fastopen; if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) return -EINVAL; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { /* Enable TFO w/o requiring TCP_FASTOPEN socket option. * Note that only TCP sockets (SOCK_STREAM) will reach here. * Also fastopen backlog may already been set via the option * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk); if (err) return err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } return 0; } /* * Move a socket into listening state. */ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = -EINVAL; lock_sock(sk); if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto out; err = __inet_listen_sk(sk, backlog); out: release_sock(sk); return err; } EXPORT_SYMBOL(inet_listen); /* * Create an inet socket. */ static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; int try_loading_module = 0; int err; if (protocol < 0 || protocol >= IPPROTO_MAX) return -EINVAL; sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; } if (unlikely(err)) { if (try_loading_module < 2) { rcu_read_unlock(); /* * Be more specific, e.g. net-pf-2-proto-132-type-1 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) */ if (++try_loading_module == 1) request_module("net-pf-%d-proto-%d-type-%d", PF_INET, protocol, sock->type); /* * Fall back to generic, e.g. net-pf-2-proto-132 * (net-pf-PF_INET-proto-IPPROTO_SCTP) */ else request_module("net-pf-%d-proto-%d", PF_INET, protocol); goto lookup_protocol; } else goto out_rcu_unlock; } err = -EPERM; if (sock->type == SOCK_RAW && !kern && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; answer_flags = answer->flags; rcu_read_unlock(); WARN_ON(!answer_prot->slab); err = -ENOMEM; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; err = 0; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; if (INET_PROTOSW_ICSK & answer_flags) inet_init_csk_locks(sk); inet = inet_sk(sk); inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); inet_clear_bit(NODEFRAG, sk); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet_set_bit(HDRINCL, sk); } if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; atomic_set(&inet->inet_id, 0); sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; inet_set_bit(MC_ALL, sk); inet->mc_index = 0; inet->mc_list = NULL; inet->rcv_tos = 0; if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ err = sk->sk_prot->hash(sk); if (err) goto out_sk_release; } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); if (err) goto out_sk_release; } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); if (err) goto out_sk_release; } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; out_sk_release: sk_common_release(sk); sock->sk = NULL; goto out; } /* * The peer socket should always be NULL (or else). When we call this * function we are destroying the object and from then on nobody * should refer to it. */ int inet_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { long timeout; if (!sk->sk_kern_sock) BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); /* If linger is set, we don't return until the close * is complete. Otherwise we return immediately. The * actually closing is done the same either way. * * If the close is due to the process exiting, we never * linger.. */ timeout = 0; if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) timeout = sk->sk_lingertime; sk->sk_prot->close(sk, timeout); sock->sk = NULL; } return 0; } EXPORT_SYMBOL(inet_release); int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) { u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { return sk->sk_prot->bind(sk, uaddr, addr_len); } if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len, CGROUP_INET4_BIND, &flags); if (err) return err; return __inet_bind(sk, uaddr, addr_len, flags); } int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { return inet_bind_sk(sock->sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; u32 tb_id = RT_TABLE_LOCAL; int err; if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) * only if s_addr is INADDR_ANY. */ err = -EAFNOSUPPORT; if (addr->sin_family != AF_UNSPEC || addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; } tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since * allowing applications to make a non-local bind solves * several problems with systems using dynamic addressing. * (ie. your servers still start up even if your ISDN link * is temporarily down) */ err = -EADDRNOTAVAIL; if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, chk_addr_ret)) goto out; snum = ntohs(addr->sin_port); err = -EACCES; if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; /* We keep a pair of addresses. rcv_saddr is the one * used by hash lookups, and saddr is used for transmit. * * In the BSD API these are the same except where it * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ err = -EINVAL; if (sk->sk_state != TCP_CLOSE || inet->inet_num) goto out_release_sock; inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; goto out_release_sock; } if (!(flags & BIND_FROM_BPF)) { err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; if (sk->sk_prot->put_port) sk->sk_prot->put_port(sk); goto out_release_sock; } } } if (inet->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); inet->inet_daddr = 0; inet->inet_dport = 0; sk_dst_reset(sk); err = 0; out_release_sock: if (flags & BIND_WITH_LOCK) release_sock(sk); out: return err; } int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; const struct proto *prot; int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); if (uaddr->sa_family == AF_UNSPEC) return prot->disconnect(sk, flags); if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { err = prot->pre_connect(sk, uaddr, addr_len); if (err) return err; } if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; return prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. * Connect() does not allow to get error notifications * without closing the socket. */ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); if (signal_pending(current) || !timeo) break; } remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; return timeo; } /* * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; int err; long timeo; /* * uaddr can be NULL and addr_len can be 0 if: * sk is a TCP fastopen active socket and * TCP_FASTOPEN_CONNECT sockopt is set and * we already have a valid cookie for this socket. * In this case, user can call write() after connect(). * write() will invoke tcp_sendmsg_fastopen() which calls * __inet_stream_connect(). */ if (uaddr) { if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) { sk->sk_disconnects++; err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; goto out; } } switch (sock->state) { default: err = -EINVAL; goto out; case SS_CONNECTED: err = -EISCONN; goto out; case SS_CONNECTING: if (inet_test_bit(DEFER_CONNECT, sk)) err = is_sendmsg ? -EINPROGRESS : -EISCONN; else err = -EALREADY; /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: err = -EISCONN; if (sk->sk_state != TCP_CLOSE) goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); if (err) goto out; } err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; sock->state = SS_CONNECTING; if (!err && inet_test_bit(DEFER_CONNECT, sk)) goto out; /* Just entered SS_CONNECTING state; the only * difference is that return value in non-blocking * case is EINPROGRESS, rather than EALREADY. */ err = -EINPROGRESS; break; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { int writebias = (sk->sk_protocol == IPPROTO_TCP) && tcp_sk(sk)->fastopen_req && tcp_sk(sk)->fastopen_req->data ? 1 : 0; int dis = sk->sk_disconnects; /* Error code is set above */ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) goto out; err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; if (dis != sk->sk_disconnects) { err = -EPIPE; goto out; } } /* Connection was closed by RST, timeout, ICMP error * or another process disconnected us. */ if (sk->sk_state == TCP_CLOSE) goto sock_error; /* sk->sk_err may be not zero now, if RECVERR was ordered by user * and error was received after socket entered established state. * Hence, it is handled normally after connect() return successfully. */ sock->state = SS_CONNECTED; err = 0; out: return err; sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; sk->sk_disconnects++; if (sk->sk_prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; goto out; } EXPORT_SYMBOL(__inet_stream_connect); int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { int err; lock_sock(sock->sk); err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); release_sock(sock->sk); return err; } EXPORT_SYMBOL(inet_stream_connect); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk) { sock_rps_record_flow(newsk); WARN_ON(!((1 << newsk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_CLOSE))); if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) set_bit(SOCK_SUPPORT_ZC, &newsock->flags); sock_graft(newsk, newsock); newsock->state = SS_CONNECTED; } /* * Accept a pending connection. The TCP layer now gives BSD semantics. */ int inet_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk1 = sock->sk, *sk2; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ arg->err = -EINVAL; sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg); if (!sk2) return arg->err; lock_sock(sk2); __inet_accept(sock, newsock, sk2); release_sock(sk2); return 0; } EXPORT_SYMBOL(inet_accept); /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); int sin_addr_len = sizeof(*sin); sin->sin_family = AF_INET; lock_sock(sk); if (peer) { if (!inet->inet_dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1)) { release_sock(sk); return -ENOTCONN; } sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, CGROUP_INET4_GETPEERNAME); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, CGROUP_INET4_GETSOCKNAME); } release_sock(sk); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sin_addr_len; } EXPORT_SYMBOL(inet_getname); int inet_send_prepare(struct sock *sk) { sock_rps_record_flow(sk); /* We may need to bind the socket. */ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && inet_autobind(sk)) return -EAGAIN; return 0; } EXPORT_SYMBOL_GPL(inet_send_prepare); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg, sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); void inet_splice_eof(struct socket *sock) { const struct proto *prot; struct sock *sk = sock->sk; if (unlikely(inet_send_prepare(sk))) return; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); if (prot->splice_eof) prot->splice_eof(sock); } EXPORT_SYMBOL_GPL(inet_splice_eof); INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, size_t, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } EXPORT_SYMBOL(inet_recvmsg); int inet_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; /* This should really check to make sure * the socket is a TCP socket. (WHY AC...) */ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and 1->2 bit 2 snds. 2->3 */ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ return -EINVAL; lock_sock(sk); if (sock->state == SS_CONNECTING) { if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) sock->state = SS_DISCONNECTING; else sock->state = SS_CONNECTED; } switch (sk->sk_state) { case TCP_CLOSE: err = -ENOTCONN; /* Hack to wake up other listeners, who can poll for EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ fallthrough; default: WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how); if (sk->sk_prot->shutdown) sk->sk_prot->shutdown(sk, how); break; /* Remaining two branches are temporary solution for missing * close() in multithreaded environment. It is _not_ a good idea, * but we have no choice until close() is repaired at VFS level. */ case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; fallthrough; case TCP_SYN_SENT: err = sk->sk_prot->disconnect(sk, O_NONBLOCK); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; break; } /* Wake up anyone sleeping in poll. */ sk->sk_state_change(sk); release_sock(sk); return err; } EXPORT_SYMBOL(inet_shutdown); /* * ioctl() calls you can issue on an INET socket. Most of these are * device configuration and stuff and very rarely used. Some ioctls * pass on to the socket itself. * * NOTE: I like the idea of a module for the config stuff. ie ifconfig * loads the devconfigure module does its configuring and unloads it. * There's a good 20K of config code hanging around the kernel. */ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; int err = 0; struct net *net = sock_net(sk); void __user *p = (void __user *)arg; struct ifreq ifr; struct rtentry rt; switch (cmd) { case SIOCADDRT: case SIOCDELRT: if (copy_from_user(&rt, p, sizeof(struct rtentry))) return -EFAULT; err = ip_rt_ioctl(net, cmd, &rt); break; case SIOCRTMSG: err = -EINVAL; break; case SIOCDARP: case SIOCGARP: case SIOCSARP: err = arp_ioctl(net, cmd, (void __user *)arg); break; case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFNETMASK: case SIOCGIFDSTADDR: case SIOCGIFPFLAGS: if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); if (!err && put_user_ifreq(&ifr, p)) err = -EFAULT; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFNETMASK: case SIOCSIFDSTADDR: case SIOCSIFPFLAGS: case SIOCSIFFLAGS: if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); break; default: if (sk->sk_prot->ioctl) err = sk_ioctl(sk, cmd, (void __user *)arg); else err = -ENOIOCTLCMD; break; } return err; } EXPORT_SYMBOL(inet_ioctl); #ifdef CONFIG_COMPAT static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, struct compat_rtentry __user *ur) { compat_uptr_t rtdev; struct rtentry rt; if (copy_from_user(&rt.rt_dst, &ur->rt_dst, 3 * sizeof(struct sockaddr)) || get_user(rt.rt_flags, &ur->rt_flags) || get_user(rt.rt_metric, &ur->rt_metric) || get_user(rt.rt_mtu, &ur->rt_mtu) || get_user(rt.rt_window, &ur->rt_window) || get_user(rt.rt_irtt, &ur->rt_irtt) || get_user(rtdev, &ur->rt_dev)) return -EFAULT; rt.rt_dev = compat_ptr(rtdev); return ip_rt_ioctl(sock_net(sk), cmd, &rt); } static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; switch (cmd) { case SIOCADDRT: case SIOCDELRT: return inet_compat_routing_ioctl(sk, cmd, argp); default: if (!sk->sk_prot->compat_ioctl) return -ENOIOCTLCMD; return sk->sk_prot->compat_ioctl(sk, cmd, arg); } } #endif /* CONFIG_COMPAT */ const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, .splice_read = tcp_splice_read, .set_peek_off = sk_set_peek_off, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif .set_rcvlowat = tcp_set_rcvlowat, }; EXPORT_SYMBOL(inet_stream_ops); const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = udp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, .set_peek_off = udp_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif }; EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without * udp_poll */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = datagram_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif }; static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, }; /* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */ static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, }, { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, } }; #define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) void inet_register_protosw(struct inet_protosw *p) { struct list_head *lh; struct inet_protosw *answer; int protocol = p->protocol; struct list_head *last_perm; spin_lock_bh(&inetsw_lock); if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); /* Check only the non-wild match. */ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) break; if (protocol == answer->protocol) goto out_permanent; last_perm = lh; } /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry. This means that when we remove this entry, the * system automatically returns to the old behavior. */ list_add_rcu(&p->list, last_perm); out: spin_unlock_bh(&inetsw_lock); return; out_permanent: pr_err("Attempt to override permanent protocol %d\n", protocol); goto out; out_illegal: pr_err("Ignoring attempt to register invalid socket type %d\n", p->type); goto out; } EXPORT_SYMBOL(inet_register_protosw); void inet_unregister_protosw(struct inet_protosw *p) { if (INET_PROTOSW_PERMANENT & p->flags) { pr_err("Attempt to unregister permanent protocol %d\n", p->protocol); } else { spin_lock_bh(&inetsw_lock); list_del_rcu(&p->list); spin_unlock_bh(&inetsw_lock); synchronize_net(); } } EXPORT_SYMBOL(inet_unregister_protosw); static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; __be32 new_saddr; struct ip_options_rcu *inet_opt; int err; inet_opt = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; /* Query new route. */ fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if, sk->sk_protocol, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); new_saddr = fl4->saddr; if (new_saddr == old_saddr) { sk_setup_caps(sk, &rt->dst); return 0; } err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET); if (err) { ip_rt_put(rt); return err; } sk_setup_caps(sk, &rt->dst); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) { pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", __func__, &old_saddr, &new_saddr); } /* * XXX The only one ugly spot where we need to * XXX really change the sockets identity after * XXX it has entered the hashes. -DaveM * * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) { struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0)); struct inet_sock *inet = inet_sk(sk); __be32 daddr; struct ip_options_rcu *inet_opt; struct flowi4 *fl4; int err; /* Route is OK, nothing to do. */ if (rt) return 0; /* Reroute. */ rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); daddr = inet->inet_daddr; if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; rcu_read_unlock(); fl4 = &inet->cork.fl.u.ip4; rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, ip_sock_rt_tos(sk), sk->sk_bound_dev_if); if (!IS_ERR(rt)) { err = 0; sk_setup_caps(sk, &rt->dst); } else { err = PTR_ERR(rt); /* Routing failed... */ sk->sk_route_caps = 0; /* * Other protocols have to map its equivalent state to TCP_SYN_SENT. * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme */ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) WRITE_ONCE(sk->sk_err_soft, -err); } return err; } EXPORT_SYMBOL(inet_sk_rebuild_header); void inet_sk_set_state(struct sock *sk, int state) { trace_inet_sock_set_state(sk, sk->sk_state, state); sk->sk_state = state; } EXPORT_SYMBOL(inet_sk_set_state); void inet_sk_state_store(struct sock *sk, int newstate) { trace_inet_sock_set_state(sk, sk->sk_state, newstate); smp_store_release(&sk->sk_state, newstate); } struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; int ihl; int id; skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; iph = ip_hdr(skb); ihl = iph->ihl * 4; if (ihl < sizeof(*iph)) goto out; id = ntohs(iph->id); proto = iph->protocol; /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; __skb_pull(skb, ihl); encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += ihl; skb_reset_transport_header(skb); segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) goto out; } ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) goto out; gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); if (udpfrag) { iph->frag_off = htons(offset >> 3); if (skb->next) iph->frag_off |= htons(IP_MF); offset += skb->len - nhoff - ihl; tot_len = skb->len - nhoff; } else if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; } if (gso_partial) tot_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)iph; else tot_len = skb->len - nhoff; } else { if (!fixedid) iph->id = htons(id++); tot_len = skb->len - nhoff; } iph->tot_len = htons(tot_len); ip_send_check(iph); if (encap) skb_reset_inner_headers(skb); skb->network_header = (u8 *)iph - skb->head; skb_reset_mac_len(skb); } while ((skb = skb->next)); out: return segs; } static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) return ERR_PTR(-EINVAL); return inet_gso_segment(skb, features); } struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; const struct iphdr *iph; struct sk_buff *p; unsigned int hlen; unsigned int off; int flush = 1; int proto; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); iph = skb_gro_header(skb, hlen, off); if (unlikely(!iph)) goto out; proto = iph->protocol; ops = rcu_dereference(inet_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; if (*(u8 *)iph != 0x45) goto out; if (ip_is_fragment(iph)) goto out; if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out; NAPI_GRO_CB(skb)->proto = proto; flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF)); list_for_each_entry(p, head, list) { struct iphdr *iph2; if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = (struct iphdr *)(p->data + off); /* The above works because, with the exception of the top * (inner most) layer, we only aggregate pkts with the same * hdr length so all the hdrs we'll need to verify will start * at the same offset. */ if ((iph->protocol ^ iph2->protocol) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } NAPI_GRO_CB(skb)->flush |= flush; NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off; /* Note : No need to call skb_gro_postpull_rcsum() here, * as we already checked checksum over ipv4 header was 0 */ skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, ops->callbacks.gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static struct sk_buff *ipip_gro_receive(struct list_head *head, struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return inet_gro_receive(head, skb); } #define SECONDS_PER_DAY 86400 /* inet_current_timestamp - Return IP network timestamp * * Return milliseconds since midnight in network byte order. */ __be32 inet_current_timestamp(void) { u32 secs; u32 msecs; struct timespec64 ts; ktime_get_real_ts64(&ts); /* Get secs since midnight. */ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); /* Convert to msecs. */ msecs = secs * MSEC_PER_SEC; /* Convert nsec to msec. */ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; /* Convert to network byte order. */ return htonl(msecs); } EXPORT_SYMBOL(inet_current_timestamp); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { unsigned int family = READ_ONCE(sk->sk_family); if (family == AF_INET) return ip_recv_error(sk, msg, len, addr_len); #if IS_ENABLED(CONFIG_IPV6) if (family == AF_INET6) return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); #endif return -EINVAL; } EXPORT_SYMBOL(inet_recv_error); int inet_gro_complete(struct sk_buff *skb, int nhoff) { struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; __be16 totlen = iph->tot_len; int proto = iph->protocol; int err = -ENOSYS; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); skb_set_inner_network_header(skb, nhoff); } iph_set_totlen(iph, skb->len - nhoff); csum_replace2(&iph->check, totlen, iph->tot_len); ops = rcu_dereference(inet_offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; /* Only need to add sizeof(*iph) to get to the next hdr below * because any hdr with option will have been flushed in * inet_gro_receive(). */ err = INDIRECT_CALL_2(ops->callbacks.gro_complete, tcp4_gro_complete, udp4_gro_complete, skb, nhoff + sizeof(*iph)); out: return err; } static int ipip_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return inet_gro_complete(skb, nhoff); } int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net) { struct socket *sock; int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; (*sk)->sk_allocation = GFP_ATOMIC; (*sk)->sk_use_task_frag = false; /* * Unhash it so that IP input processing does not even see it, * we do not wish this socket to see incoming packets. */ (*sk)->sk_prot->unhash(*sk); } return rc; } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; int i; for_each_possible_cpu(i) res += snmp_get_cpu_field(mib, i, offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, size_t syncp_offset) { void *bhptr; struct u64_stats_sync *syncp; u64 v; unsigned int start; bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { start = u64_stats_fetch_begin(syncp); v = *(((u64 *)bhptr) + offt); } while (u64_stats_fetch_retry(syncp, start)); return v; } EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; for_each_possible_cpu(cpu) { res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); } return res; } EXPORT_SYMBOL_GPL(snmp_fold_field64); #endif #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, }; #endif static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, .no_policy = 1, }; static __net_init int ipv4_mib_init_net(struct net *net) { int i; net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); if (!net->mib.tcp_statistics) goto err_tcp_mib; net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); if (!net->mib.ip_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet_stats; af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); u64_stats_init(&af_inet_stats->syncp); } net->mib.net_statistics = alloc_percpu(struct linux_mib); if (!net->mib.net_statistics) goto err_net_mib; net->mib.udp_statistics = alloc_percpu(struct udp_mib); if (!net->mib.udp_statistics) goto err_udp_mib; net->mib.udplite_statistics = alloc_percpu(struct udp_mib); if (!net->mib.udplite_statistics) goto err_udplite_mib; net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); if (!net->mib.icmp_statistics) goto err_icmp_mib; net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), GFP_KERNEL); if (!net->mib.icmpmsg_statistics) goto err_icmpmsg_mib; tcp_mib_init(net); return 0; err_icmpmsg_mib: free_percpu(net->mib.icmp_statistics); err_icmp_mib: free_percpu(net->mib.udplite_statistics); err_udplite_mib: free_percpu(net->mib.udp_statistics); err_udp_mib: free_percpu(net->mib.net_statistics); err_net_mib: free_percpu(net->mib.ip_statistics); err_ip_mib: free_percpu(net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } static __net_exit void ipv4_mib_exit_net(struct net *net) { kfree(net->mib.icmpmsg_statistics); free_percpu(net->mib.icmp_statistics); free_percpu(net->mib.udplite_statistics); free_percpu(net->mib.udp_statistics); free_percpu(net->mib.net_statistics); free_percpu(net->mib.ip_statistics); free_percpu(net->mib.tcp_statistics); #ifdef CONFIG_MPTCP /* allocated on demand, see mptcp_init_sock() */ free_percpu(net->mib.mptcp_statistics); #endif } static __net_initdata struct pernet_operations ipv4_mib_ops = { .init = ipv4_mib_init_net, .exit = ipv4_mib_exit_net, }; static int __init init_ipv4_mibs(void) { return register_pernet_subsys(&ipv4_mib_ops); } static __net_init int inet_init_net(struct net *net) { /* * Set defaults for local port range */ net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u; seqlock_init(&net->ipv4.ping_group_range.lock); /* * Sane defaults - nobody may create ping sockets. * Boot scripts should set this to distro-specific group. */ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); /* Default values for sysctl-controlled parameters. * We set them here, in case sysctl is not compiled. */ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_fwd_update_priority = 1; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif /* Some igmp sysctl, whose values are always used */ net->ipv4.sysctl_igmp_max_memberships = 20; net->ipv4.sysctl_igmp_max_msf = 10; /* IGMP reports for link-local multicast groups are enabled by default */ net->ipv4.sysctl_igmp_llm_reports = 1; net->ipv4.sysctl_igmp_qrv = 2; net->ipv4.sysctl_fib_notify_on_flag_change = 0; return 0; } static __net_initdata struct pernet_operations af_inet_ops = { .init = inet_init_net, }; static int __init init_inet_pernet_ops(void) { return register_pernet_subsys(&af_inet_ops); } static int ipv4_proc_init(void); /* * IP protocol layer initialiser */ static const struct net_offload ipip_offload = { .callbacks = { .gso_segment = ipip_gso_segment, .gro_receive = ipip_gro_receive, .gro_complete = ipip_gro_complete, }, }; static int __init ipip_offload_init(void) { return inet_add_offload(&ipip_offload, IPPROTO_IPIP); } static int __init ipv4_offload_init(void) { /* * Add offloads */ if (udpv4_offload_init() < 0) pr_crit("%s: Cannot add UDP protocol offload\n", __func__); if (tcpv4_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); if (ipip_offload_init() < 0) pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); net_hotdata.ip_packet_offload = (struct packet_offload) { .type = cpu_to_be16(ETH_P_IP), .callbacks = { .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, }, }; dev_add_offload(&net_hotdata.ip_packet_offload); return 0; } fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, .list_func = ip_list_rcv, }; static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; int rc; sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); raw_hashinfo_init(&raw_v4_hashinfo); rc = proto_register(&tcp_prot, 1); if (rc) goto out; rc = proto_register(&udp_prot, 1); if (rc) goto out_unregister_tcp_proto; rc = proto_register(&raw_prot, 1); if (rc) goto out_unregister_udp_proto; rc = proto_register(&ping_prot, 1); if (rc) goto out_unregister_raw_proto; /* * Tell SOCKET that we are alive... */ (void)sock_register(&inet_family_ops); #ifdef CONFIG_SYSCTL ip_static_sysctl_init(); #endif /* * Add all the base protocols. */ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) pr_crit("%s: Cannot add ICMP protocol\n", __func__); net_hotdata.udp_protocol = (struct net_protocol) { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, }; if (inet_add_protocol(&net_hotdata.udp_protocol, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol\n", __func__); net_hotdata.tcp_protocol = (struct net_protocol) { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, .icmp_strict_tag_validation = 1, }; if (inet_add_protocol(&net_hotdata.tcp_protocol, IPPROTO_TCP) < 0) pr_crit("%s: Cannot add TCP protocol\n", __func__); #ifdef CONFIG_IP_MULTICAST if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) pr_crit("%s: Cannot add IGMP protocol\n", __func__); #endif /* Register the socket-side information for inet_create. */ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q); /* * Set the ARP module up */ arp_init(); /* * Set the IP module up */ ip_init(); /* Initialise per-cpu ipv4 mibs */ if (init_ipv4_mibs()) panic("%s: Cannot init ipv4 mibs\n", __func__); /* Setup TCP slab cache for open requests. */ tcp_init(); /* Setup UDP memory threshold */ udp_init(); /* Add UDP-Lite (RFC 3828) */ udplite4_register(); raw_init(); ping_init(); /* * Set the ICMP layer up */ if (icmp_init() < 0) panic("Failed to create the ICMP control socket.\n"); /* * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif if (init_inet_pernet_ops()) pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); ipv4_proc_init(); ipfrag_init(); dev_add_pack(&ip_packet_type); ip_tunnel_core_init(); rc = 0; out: return rc; out_unregister_raw_proto: proto_unregister(&raw_prot); out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); goto out; } fs_initcall(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS static int __init ipv4_proc_init(void) { int rc = 0; if (raw_proc_init()) goto out_raw; if (tcp4_proc_init()) goto out_tcp; if (udp4_proc_init()) goto out_udp; if (ping_proc_init()) goto out_ping; if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: ping_proc_exit(); out_ping: udp4_proc_exit(); out_udp: tcp4_proc_exit(); out_tcp: raw_proc_exit(); out_raw: rc = -ENOMEM; goto out; } #else /* CONFIG_PROC_FS */ static int __init ipv4_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */
199 144 118 133 182 183 19 169 50 18 6 30 36 50 23 29 7 1 7 3 3 18 14 3 15 6 5 1 10 5 5 21 13 3 1 4 18 12 1 4 5 8 1 3 5 3 1 2 17 16 6 5 7 11 11 78 17 4 21 15 61 69 69 5 17 21 5 16 25 25 11 11 66 16 58 3 51 10 10 10 53 1 58 1 1 1 7 52 1 5 7 7 4 1 6 7 69 69 69 61 61 63 1 2 50 2 21 10 6 1 12 9 50 24 64 9 5 56 61 46 16 61 61 13 48 69 69 69 56 13 1 61 1 13 56 16 15 9 4 9 8 13 11 3 1 61 56 43 11 18 3 5 50 27 27 61 52 5 46 46 1 45 45 4 7 2 7 7 17 16 14 14 7 32 1 1 5 139 135 5 5 138 29 108 71 36 101 41 96 45 111 3 107 28 134 84 67 44 110 135 106 14 20 13 58 58 18 40 12 46 43 15 44 58 2 2 27 31 37 21 21 21 21 57 1 23 29 2 4 1 58 25 33 152 11 1 145 54 30 54 54 17 18 46 148 128 146 29 140 203 99 200 203 173 171 112 194 47 122 80 175 27 27 27 27 18 18 5 5 18 18 18 190 133 12 183 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 // SPDX-License-Identifier: GPL-2.0-only /* * inode.c * * PURPOSE * Inode handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * * HISTORY * * 10/04/98 dgb Added rudimentary directory functions * 10/07/98 Fully working udf_block_map! It works! * 11/25/98 bmap altered to better support extents * 12/06/98 blf partition support in udf_iget, udf_block_map * and udf_read_inode * 12/12/98 rewrote udf_block_map to handle next extents and descs across * block boundaries (which is not actually allowed) * 12/20/98 added support for strategy 4096 * 03/07/99 rewrote udf_block_map (again) * New funcs, inode_bmap, udf_next_aext * 04/19/99 Support for writing device EA's for major/minor # */ #include "udfdecl.h" #include <linux/mm.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" #define EXTENT_MERGE_SIZE 5 #define FE_MAPPED_PERMS (FE_PERM_U_READ | FE_PERM_U_WRITE | FE_PERM_U_EXEC | \ FE_PERM_G_READ | FE_PERM_G_WRITE | FE_PERM_G_EXEC | \ FE_PERM_O_READ | FE_PERM_O_WRITE | FE_PERM_O_EXEC) #define FE_DELETE_PERMS (FE_PERM_U_DELETE | FE_PERM_G_DELETE | \ FE_PERM_O_DELETE) struct udf_map_rq; static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); static int inode_getblk(struct inode *inode, struct udf_map_rq *map); static int udf_insert_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, udf_pblk_t, struct kernel_long_ad *, int *); static void udf_prealloc_extents(struct inode *, int, int, struct kernel_long_ad *, int *); static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *); static int udf_update_extents(struct inode *, struct kernel_long_ad *, int, int, struct extent_position *); static int udf_get_block_wb(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create); static void __udf_clear_extent_cache(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->cached_extent.lstart != -1) { brelse(iinfo->cached_extent.epos.bh); iinfo->cached_extent.lstart = -1; } } /* Invalidate extent cache */ static void udf_clear_extent_cache(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); spin_lock(&iinfo->i_extent_cache_lock); __udf_clear_extent_cache(inode); spin_unlock(&iinfo->i_extent_cache_lock); } /* Return contents of extent cache */ static int udf_read_extent_cache(struct inode *inode, loff_t bcount, loff_t *lbcount, struct extent_position *pos) { struct udf_inode_info *iinfo = UDF_I(inode); int ret = 0; spin_lock(&iinfo->i_extent_cache_lock); if ((iinfo->cached_extent.lstart <= bcount) && (iinfo->cached_extent.lstart != -1)) { /* Cache hit */ *lbcount = iinfo->cached_extent.lstart; memcpy(pos, &iinfo->cached_extent.epos, sizeof(struct extent_position)); if (pos->bh) get_bh(pos->bh); ret = 1; } spin_unlock(&iinfo->i_extent_cache_lock); return ret; } /* Add extent to extent cache */ static void udf_update_extent_cache(struct inode *inode, loff_t estart, struct extent_position *pos) { struct udf_inode_info *iinfo = UDF_I(inode); spin_lock(&iinfo->i_extent_cache_lock); /* Invalidate previously cached extent */ __udf_clear_extent_cache(inode); if (pos->bh) get_bh(pos->bh); memcpy(&iinfo->cached_extent.epos, pos, sizeof(*pos)); iinfo->cached_extent.lstart = estart; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: iinfo->cached_extent.epos.offset -= sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: iinfo->cached_extent.epos.offset -= sizeof(struct long_ad); break; } spin_unlock(&iinfo->i_extent_cache_lock); } void udf_evict_inode(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); int want_delete = 0; if (!is_bad_inode(inode)) { if (!inode->i_nlink) { want_delete = 1; udf_setsize(inode, 0); udf_update_inode(inode, IS_SYNC(inode)); } if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", inode->i_ino, inode->i_mode, (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); } } truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); kfree(iinfo->i_data); iinfo->i_data = NULL; udf_clear_extent_cache(inode); if (want_delete) { udf_free_inode(inode); } } static void udf_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); loff_t isize = inode->i_size; if (to > isize) { truncate_pagecache(inode, isize); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } } } static int udf_adinicb_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct inode *inode = folio->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); BUG_ON(!folio_test_locked(folio)); BUG_ON(folio->index != 0); memcpy_from_file_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0, i_size_read(inode)); folio_unlock(folio); mark_inode_dirty(inode); return 0; } static int udf_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) return mpage_writepages(mapping, wbc, udf_get_block_wb); return write_cache_pages(mapping, wbc, udf_adinicb_writepage, NULL); } static void udf_adinicb_read_folio(struct folio *folio) { struct inode *inode = folio->mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); loff_t isize = i_size_read(inode); folio_fill_tail(folio, 0, iinfo->i_data + iinfo->i_lenEAttr, isize); folio_mark_uptodate(folio); } static int udf_read_folio(struct file *file, struct folio *folio) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { udf_adinicb_read_folio(folio); folio_unlock(folio); return 0; } return mpage_read_folio(folio, udf_get_block); } static void udf_readahead(struct readahead_control *rac) { struct udf_inode_info *iinfo = UDF_I(rac->mapping->host); /* * No readahead needed for in-ICB files and udf_get_block() would get * confused for such file anyway. */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return; mpage_readahead(rac, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct udf_inode_info *iinfo = UDF_I(file_inode(file)); struct folio *folio; int ret; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { ret = block_write_begin(mapping, pos, len, foliop, udf_get_block); if (unlikely(ret)) udf_write_failed(mapping, pos + len); return ret; } if (WARN_ON_ONCE(pos >= PAGE_SIZE)) return -EIO; folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); *foliop = folio; if (!folio_test_uptodate(folio)) udf_adinicb_read_folio(folio); return 0; } static int udf_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = file_inode(file); loff_t last_pos; if (UDF_I(inode)->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) return generic_write_end(file, mapping, pos, len, copied, folio, fsdata); last_pos = pos + copied; if (last_pos > inode->i_size) i_size_write(inode, last_pos); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; /* Fallback to buffered IO for in-ICB files */ if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return 0; ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block); if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) udf_write_failed(mapping, iocb->ki_pos + count); return ret; } static sector_t udf_bmap(struct address_space *mapping, sector_t block) { struct udf_inode_info *iinfo = UDF_I(mapping->host); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) return -EINVAL; return generic_block_bmap(mapping, block, udf_get_block); } const struct address_space_operations udf_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = udf_read_folio, .readahead = udf_readahead, .writepages = udf_writepages, .write_begin = udf_write_begin, .write_end = udf_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, .migrate_folio = buffer_migrate_folio, }; /* * Expand file stored in ICB to a normal one-block-file * * This function requires i_mutex held */ int udf_expand_file_adinicb(struct inode *inode) { struct folio *folio; struct udf_inode_info *iinfo = UDF_I(inode); int err; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { down_write(&iinfo->i_data_sem); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; } folio = __filemap_get_folio(inode->i_mapping, 0, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, GFP_KERNEL); if (IS_ERR(folio)) return PTR_ERR(folio); if (!folio_test_uptodate(folio)) udf_adinicb_read_folio(folio); down_write(&iinfo->i_data_sem); memset(iinfo->i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; folio_mark_dirty(folio); folio_unlock(folio); up_write(&iinfo->i_data_sem); err = filemap_fdatawrite(inode->i_mapping); if (err) { /* Restore everything back so that we don't lose data... */ folio_lock(folio); down_write(&iinfo->i_data_sem); memcpy_from_folio(iinfo->i_data + iinfo->i_lenEAttr, folio, 0, inode->i_size); folio_unlock(folio); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } folio_put(folio); mark_inode_dirty(inode); return err; } #define UDF_MAP_CREATE 0x01 /* Mapping can allocate new blocks */ #define UDF_MAP_NOPREALLOC 0x02 /* Do not preallocate blocks */ #define UDF_BLK_MAPPED 0x01 /* Block was successfully mapped */ #define UDF_BLK_NEW 0x02 /* Block was freshly allocated */ struct udf_map_rq { sector_t lblk; udf_pblk_t pblk; int iflags; /* UDF_MAP_ flags determining behavior */ int oflags; /* UDF_BLK_ flags reporting results */ }; static int udf_map_block(struct inode *inode, struct udf_map_rq *map) { int ret; struct udf_inode_info *iinfo = UDF_I(inode); if (WARN_ON_ONCE(iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)) return -EFSCORRUPTED; map->oflags = 0; if (!(map->iflags & UDF_MAP_CREATE)) { struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; int8_t etype; down_read(&iinfo->i_data_sem); ret = inode_bmap(inode, map->lblk, &epos, &eloc, &elen, &offset, &etype); if (ret < 0) goto out_read; if (ret > 0 && etype == (EXT_RECORDED_ALLOCATED >> 30)) { map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); map->oflags |= UDF_BLK_MAPPED; ret = 0; } out_read: up_read(&iinfo->i_data_sem); brelse(epos.bh); return ret; } down_write(&iinfo->i_data_sem); /* * Block beyond EOF and prealloc extents? Just discard preallocation * as it is not useful and complicates things. */ if (((loff_t)map->lblk) << inode->i_blkbits >= iinfo->i_lenExtents) udf_discard_prealloc(inode); udf_clear_extent_cache(inode); ret = inode_getblk(inode, map); up_write(&iinfo->i_data_sem); return ret; } static int __udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int flags) { int err; struct udf_map_rq map = { .lblk = block, .iflags = flags, }; err = udf_map_block(inode, &map); if (err < 0) return err; if (map.oflags & UDF_BLK_MAPPED) { map_bh(bh_result, inode->i_sb, map.pblk); if (map.oflags & UDF_BLK_NEW) set_buffer_new(bh_result); } return 0; } int udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { int flags = create ? UDF_MAP_CREATE : 0; /* * We preallocate blocks only for regular files. It also makes sense * for directories but there's a problem when to drop the * preallocation. We might use some delayed work for that but I feel * it's overengineering for a filesystem like UDF. */ if (!S_ISREG(inode->i_mode)) flags |= UDF_MAP_NOPREALLOC; return __udf_get_block(inode, block, bh_result, flags); } /* * We shouldn't be allocating blocks on page writeback since we allocate them * on page fault. We can spot dirty buffers without allocated blocks though * when truncate expands file. These however don't have valid data so we can * safely ignore them. So never allocate blocks from page writeback. */ static int udf_get_block_wb(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { return __udf_get_block(inode, block, bh_result, 0); } /* Extend the file with new blocks totaling 'new_block_bytes', * return the number of extents added */ static int udf_do_extend_file(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, loff_t new_block_bytes) { uint32_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; struct udf_inode_info *iinfo; int err; /* The previous extent is fake and we should not extend by anything * - there's nothing to do... */ if (!new_block_bytes && fake) return 0; iinfo = UDF_I(inode); /* Round the last extent up to a multiple of block size */ if (last_ext->extLength & (sb->s_blocksize - 1)) { last_ext->extLength = (last_ext->extLength & UDF_EXTENT_FLAG_MASK) | (((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1)); iinfo->i_lenExtents = (iinfo->i_lenExtents + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); } add = 0; /* Can we merge with the previous extent? */ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { add = (1 << 30) - sb->s_blocksize - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); if (add > new_block_bytes) add = new_block_bytes; new_block_bytes -= add; last_ext->extLength += add; } if (fake) { err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err < 0) goto out_err; count++; } else { struct kernel_lb_addr tmploc; uint32_t tmplen; int8_t tmptype; udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); /* * We've rewritten the last extent. If we are going to add * more extents, we may need to enter possible following * empty indirect extent. */ if (new_block_bytes) { err = udf_next_aext(inode, last_pos, &tmploc, &tmplen, &tmptype, 0); if (err < 0) goto out_err; } } iinfo->i_lenExtents += add; /* Managed to do everything necessary? */ if (!new_block_bytes) goto out; /* All further extents will be NOT_RECORDED_NOT_ALLOCATED */ last_ext->extLocation.logicalBlockNum = 0; last_ext->extLocation.partitionReferenceNum = 0; add = (1 << 30) - sb->s_blocksize; last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | add; /* Create enough extents to cover the whole hole */ while (new_block_bytes > add) { new_block_bytes -= add; err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) goto out_err; iinfo->i_lenExtents += add; count++; } if (new_block_bytes) { last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | new_block_bytes; err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) goto out_err; iinfo->i_lenExtents += new_block_bytes; count++; } out: /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) last_pos->offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) last_pos->offset -= sizeof(struct long_ad); else return -EIO; return count; out_err: /* Remove extents we've created so far */ udf_clear_extent_cache(inode); udf_truncate_extents(inode); return err; } /* Extend the final block of the file to final_block_len bytes */ static void udf_do_extend_final_block(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, uint32_t new_elen) { uint32_t added_bytes; /* * Extent already large enough? It may be already rounded up to block * size... */ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) return; added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); } static int udf_extend_file(struct inode *inode, loff_t newsize) { struct extent_position epos; struct kernel_lb_addr eloc; uint32_t elen; int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = newsize >> sb->s_blocksize_bits, offset; loff_t new_elen; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); struct kernel_long_ad extent; int err = 0; bool within_last_ext; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); down_write(&iinfo->i_data_sem); /* * When creating hole in file, just don't bother with preserving * preallocation. It likely won't be very useful anyway. */ udf_discard_prealloc(inode); err = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset, &etype); if (err < 0) goto out; within_last_ext = (err == 1); /* We don't expect extents past EOF... */ WARN_ON_ONCE(within_last_ext && elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { /* File has no extents at all or has empty last * indirect extent! Create a fake extent... */ extent.extLocation.logicalBlockNum = 0; extent.extLocation.partitionReferenceNum = 0; extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; } else { epos.offset -= adsize; err = udf_next_aext(inode, &epos, &extent.extLocation, &extent.extLength, &etype, 0); if (err <= 0) goto out; extent.extLength |= etype << 30; } new_elen = ((loff_t)offset << inode->i_blkbits) | (newsize & (sb->s_blocksize - 1)); /* File has extent covering the new size (could happen when extending * inside a block)? */ if (within_last_ext) { /* Extending file within the last file block */ udf_do_extend_final_block(inode, &epos, &extent, new_elen); } else { err = udf_do_extend_file(inode, &epos, &extent, new_elen); } if (err < 0) goto out; err = 0; out: brelse(epos.bh); up_write(&iinfo->i_data_sem); return err; } static int inode_getblk(struct inode *inode, struct udf_map_rq *map) { struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; uint32_t elen = 0, tmpelen; struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; udf_pblk_t newblocknum; sector_t offset = 0; int8_t etype, tmpetype; struct udf_inode_info *iinfo = UDF_I(inode); udf_pblk_t goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; bool isBeyondEOF = false; int ret = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; cur_epos = next_epos = prev_epos; b_off = (loff_t)map->lblk << inode->i_sb->s_blocksize_bits; /* find the extent which contains the block we are looking for. alternate between laarr[0] and laarr[1] for locations of the current extent, and the previous extent */ do { if (prev_epos.bh != cur_epos.bh) { brelse(prev_epos.bh); get_bh(cur_epos.bh); prev_epos.bh = cur_epos.bh; } if (cur_epos.bh != next_epos.bh) { brelse(cur_epos.bh); get_bh(next_epos.bh); cur_epos.bh = next_epos.bh; } lbcount += elen; prev_epos.block = cur_epos.block; cur_epos.block = next_epos.block; prev_epos.offset = cur_epos.offset; cur_epos.offset = next_epos.offset; ret = udf_next_aext(inode, &next_epos, &eloc, &elen, &etype, 1); if (ret < 0) { goto out_free; } else if (ret == 0) { isBeyondEOF = true; break; } c = !c; laarr[c].extLength = (etype << 30) | elen; laarr[c].extLocation = eloc; if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) pgoal = eloc.logicalBlockNum + ((elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); count++; } while (lbcount + elen <= b_off); b_off -= lbcount; offset = b_off >> inode->i_sb->s_blocksize_bits; /* * Move prev_epos and cur_epos into indirect extent if we are at * the pointer to it */ ret = udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, &tmpetype, 0); if (ret < 0) goto out_free; ret = udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, &tmpetype, 0); if (ret < 0) goto out_free; /* if the extent is allocated and recorded, return the block if the extent is not a multiple of the blocksize, round up */ if (!isBeyondEOF && etype == (EXT_RECORDED_ALLOCATED >> 30)) { if (elen & (inode->i_sb->s_blocksize - 1)) { elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); iinfo->i_lenExtents = ALIGN(iinfo->i_lenExtents, inode->i_sb->s_blocksize); udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } map->oflags = UDF_BLK_MAPPED; map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); goto out_free; } /* Are we beyond EOF and preallocated extent? */ if (isBeyondEOF) { loff_t hole_len; if (count) { if (c) laarr[0] = laarr[1]; startnum = 1; } else { /* Create a fake extent when there's not one */ memset(&laarr[0].extLocation, 0x00, sizeof(struct kernel_lb_addr)); laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; /* Will udf_do_extend_file() create real extent from a fake one? */ startnum = (offset > 0); } /* Create extents for the hole between EOF and offset */ hole_len = (loff_t)offset << inode->i_blkbits; ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len); if (ret < 0) goto out_free; c = 0; offset = 0; count += ret; /* * Is there any real extent? - otherwise we overwrite the fake * one... */ if (count) c = !c; laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | inode->i_sb->s_blocksize; memset(&laarr[c].extLocation, 0x00, sizeof(struct kernel_lb_addr)); count++; endnum = c + 1; lastblock = 1; } else { endnum = startnum = ((count > 2) ? 2 : count); /* if the current extent is in position 0, swap it with the previous */ if (!c && count != 1) { laarr[2] = laarr[0]; laarr[0] = laarr[1]; laarr[1] = laarr[2]; c = 1; } /* if the current block is located in an extent, read the next extent */ ret = udf_next_aext(inode, &next_epos, &eloc, &elen, &etype, 0); if (ret > 0) { laarr[c + 1].extLength = (etype << 30) | elen; laarr[c + 1].extLocation = eloc; count++; startnum++; endnum++; } else if (ret == 0) lastblock = 1; else goto out_free; } /* if the current extent is not recorded but allocated, get the * block in the extent corresponding to the requested block */ if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) newblocknum = laarr[c].extLocation.logicalBlockNum + offset; else { /* otherwise, allocate a new block */ if (iinfo->i_next_alloc_block == map->lblk) goal = iinfo->i_next_alloc_goal; if (!goal) { if (!(goal = pgoal)) /* XXX: what was intended here? */ goal = iinfo->i_location.logicalBlockNum + 1; } newblocknum = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, goal, &ret); if (!newblocknum) goto out_free; if (isBeyondEOF) iinfo->i_lenExtents += inode->i_sb->s_blocksize; } /* if the extent the requsted block is located in contains multiple * blocks, split the extent into at most three extents. blocks prior * to requested block, requested block, and blocks after requested * block */ udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); if (!(map->iflags & UDF_MAP_NOPREALLOC)) udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); /* merge any continuous blocks in laarr */ udf_merge_extents(inode, laarr, &endnum); /* write back the new extents, inserting new extents if the new number * of extents is greater than the old number, and deleting extents if * the new number of extents is less than the old number */ ret = udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); if (ret < 0) goto out_free; map->pblk = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); if (!map->pblk) { ret = -EFSCORRUPTED; goto out_free; } map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED; iinfo->i_next_alloc_block = map->lblk + 1; iinfo->i_next_alloc_goal = newblocknum + 1; inode_set_ctime_current(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); ret = 0; out_free: brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); return ret; } static void udf_split_extents(struct inode *inode, int *c, int offset, udf_pblk_t newblocknum, struct kernel_long_ad *laarr, int *endnum) { unsigned long blocksize = inode->i_sb->s_blocksize; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; if ((laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30) || (laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { int curr = *c; int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits; int8_t etype = (laarr[curr].extLength >> 30); if (blen == 1) ; else if (!offset || blen == offset + 1) { laarr[curr + 2] = laarr[curr + 1]; laarr[curr + 1] = laarr[curr]; } else { laarr[curr + 3] = laarr[curr + 1]; laarr[curr + 2] = laarr[curr + 1] = laarr[curr]; } if (offset) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, &laarr[curr].extLocation, 0, offset); laarr[curr].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | (offset << blocksize_bits); laarr[curr].extLocation.logicalBlockNum = 0; laarr[curr].extLocation. partitionReferenceNum = 0; } else laarr[curr].extLength = (etype << 30) | (offset << blocksize_bits); curr++; (*c)++; (*endnum)++; } laarr[curr].extLocation.logicalBlockNum = newblocknum; if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) laarr[curr].extLocation.partitionReferenceNum = UDF_I(inode)->i_location.partitionReferenceNum; laarr[curr].extLength = EXT_RECORDED_ALLOCATED | blocksize; curr++; if (blen != offset + 1) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) laarr[curr].extLocation.logicalBlockNum += offset + 1; laarr[curr].extLength = (etype << 30) | ((blen - (offset + 1)) << blocksize_bits); curr++; (*endnum)++; } } } static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, struct kernel_long_ad *laarr, int *endnum) { int start, length = 0, currlength = 0, i; if (*endnum >= (c + 1)) { if (!lastblock) return; else start = c; } else { if ((laarr[c + 1].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { start = c + 1; length = currlength = (((laarr[c + 1].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); } else start = c; } for (i = start + 1; i <= *endnum; i++) { if (i == *endnum) { if (lastblock) length += UDF_DEFAULT_PREALLOC_BLOCKS; } else if ((laarr[i].extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { length += (((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); } else break; } if (length) { int next = laarr[start].extLocation.logicalBlockNum + (((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); int numalloc = udf_prealloc_blocks(inode->i_sb, inode, laarr[start].extLocation.partitionReferenceNum, next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ? length : UDF_DEFAULT_PREALLOC_BLOCKS) - currlength); if (numalloc) { if (start == (c + 1)) laarr[start].extLength += (numalloc << inode->i_sb->s_blocksize_bits); else { memmove(&laarr[c + 2], &laarr[c + 1], sizeof(struct long_ad) * (*endnum - (c + 1))); (*endnum)++; laarr[c + 1].extLocation.logicalBlockNum = next; laarr[c + 1].extLocation.partitionReferenceNum = laarr[c].extLocation. partitionReferenceNum; laarr[c + 1].extLength = EXT_NOT_RECORDED_ALLOCATED | (numalloc << inode->i_sb->s_blocksize_bits); start = c + 1; } for (i = start + 1; numalloc && i < *endnum; i++) { int elen = ((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (elen > numalloc) { laarr[i].extLength -= (numalloc << inode->i_sb->s_blocksize_bits); numalloc = 0; } else { numalloc -= elen; if (*endnum > (i + 1)) memmove(&laarr[i], &laarr[i + 1], sizeof(struct long_ad) * (*endnum - (i + 1))); i--; (*endnum)--; } } UDF_I(inode)->i_lenExtents += numalloc << inode->i_sb->s_blocksize_bits; } } } static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, int *endnum) { int i; unsigned long blocksize = inode->i_sb->s_blocksize; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; for (i = 0; i < (*endnum - 1); i++) { struct kernel_long_ad *li /*l[i]*/ = &laarr[i]; struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; if (((li->extLength >> 30) == (lip1->extLength >> 30)) && (((li->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) || ((lip1->extLocation.logicalBlockNum - li->extLocation.logicalBlockNum) == (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits)))) { if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) <= UDF_EXTENT_LENGTH_MASK) { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; } } else if (((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) && ((lip1->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); li->extLocation.logicalBlockNum = 0; li->extLocation.partitionReferenceNum = 0; if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { lip1->extLength = (lip1->extLength - (li->extLength & UDF_EXTENT_LENGTH_MASK) + UDF_EXTENT_LENGTH_MASK) & ~(blocksize - 1); li->extLength = (li->extLength & UDF_EXTENT_FLAG_MASK) + (UDF_EXTENT_LENGTH_MASK + 1) - blocksize; } else { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; } } else if ((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); li->extLocation.logicalBlockNum = 0; li->extLocation.partitionReferenceNum = 0; li->extLength = (li->extLength & UDF_EXTENT_LENGTH_MASK) | EXT_NOT_RECORDED_NOT_ALLOCATED; } } } static int udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, int startnum, int endnum, struct extent_position *epos) { int start = 0, i; struct kernel_lb_addr tmploc; uint32_t tmplen; int8_t tmpetype; int err; if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { err = udf_insert_aext(inode, *epos, laarr[i].extLocation, laarr[i].extLength); /* * If we fail here, we are likely corrupting the extent * list and leaking blocks. At least stop early to * limit the damage. */ if (err < 0) return err; err = udf_next_aext(inode, epos, &laarr[i].extLocation, &laarr[i].extLength, &tmpetype, 1); if (err < 0) return err; start++; } } for (i = start; i < endnum; i++) { err = udf_next_aext(inode, epos, &tmploc, &tmplen, &tmpetype, 0); if (err < 0) return err; udf_write_aext(inode, epos, &laarr[i].extLocation, laarr[i].extLength, 1); } return 0; } struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block, int create, int *err) { struct buffer_head *bh = NULL; struct udf_map_rq map = { .lblk = block, .iflags = UDF_MAP_NOPREALLOC | (create ? UDF_MAP_CREATE : 0), }; *err = udf_map_block(inode, &map); if (*err || !(map.oflags & UDF_BLK_MAPPED)) return NULL; bh = sb_getblk(inode->i_sb, map.pblk); if (!bh) { *err = -ENOMEM; return NULL; } if (map.oflags & UDF_BLK_NEW) { lock_buffer(bh); memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); return bh; } if (bh_read(bh, 0) >= 0) return bh; brelse(bh); *err = -EIO; return NULL; } int udf_setsize(struct inode *inode, loff_t newsize) { int err = 0; struct udf_inode_info *iinfo; unsigned int bsize = i_blocksize(inode); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return -EINVAL; iinfo = UDF_I(inode); if (newsize > inode->i_size) { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { if (bsize >= (udf_file_entry_alloc_offset(inode) + newsize)) { down_write(&iinfo->i_data_sem); iinfo->i_lenAlloc = newsize; up_write(&iinfo->i_data_sem); goto set_size; } err = udf_expand_file_adinicb(inode); if (err) return err; } err = udf_extend_file(inode, newsize); if (err) return err; set_size: truncate_setsize(inode, newsize); } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); memset(iinfo->i_data + iinfo->i_lenEAttr + newsize, 0x00, bsize - newsize - udf_file_entry_alloc_offset(inode)); iinfo->i_lenAlloc = newsize; truncate_setsize(inode, newsize); up_write(&iinfo->i_data_sem); goto update_time; } err = block_truncate_page(inode->i_mapping, newsize, udf_get_block); if (err) return err; truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); err = udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); if (err) return err; } update_time: inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); return err; } /* * Maximum length of linked list formed by ICB hierarchy. The chosen number is * arbitrary - just that we hopefully don't limit any real use of rewritten * inode on write-once media but avoid looping for too long on corrupted media. */ #define UDF_MAX_ICB_NESTING 1024 static int udf_read_inode(struct inode *inode, bool hidden_inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct kernel_lb_addr *iloc = &iinfo->i_location; unsigned int link_count; unsigned int indirections = 0; int bs = inode->i_sb->s_blocksize; int ret = -EIO; uint32_t uid, gid; struct timespec64 ts; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { udf_debug("partition reference: %u > logical volume partitions: %u\n", iloc->partitionReferenceNum, sbi->s_partitions); return -EIO; } if (iloc->logicalBlockNum >= sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { udf_debug("block=%u, partition=%u out of range\n", iloc->logicalBlockNum, iloc->partitionReferenceNum); return -EIO; } /* * Set defaults, but the inode is still incomplete! * Note: get_new_inode() sets the following on a new inode: * i_sb = sb * i_no = ino * i_flags = sb->s_flags * i_state = 0 * clean_inode(): zero fills and sets * i_count = 1 * i_nlink = 1 * i_op = NULL; */ bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { udf_err(inode->i_sb, "(ino %lu) failed !bh\n", inode->i_ino); return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { udf_err(inode->i_sb, "(ino %lu) failed ident=%u\n", inode->i_ino, ident); goto out; } fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); if (ie->indirectICB.extLength) { brelse(ibh); memcpy(&iinfo->i_location, &loc, sizeof(struct kernel_lb_addr)); if (++indirections > UDF_MAX_ICB_NESTING) { udf_err(inode->i_sb, "too many ICBs in ICB hierarchy" " (max %d supported)\n", UDF_MAX_ICB_NESTING); goto out; } brelse(bh); goto reread; } } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { udf_err(inode->i_sb, "unsupported strategy type: %u\n", le16_to_cpu(fe->icbTag.strategyType)); goto out; } if (fe->icbTag.strategyType == cpu_to_le16(4)) iinfo->i_strat4096 = 0; else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ iinfo->i_strat4096 = 1; iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & ICBTAG_FLAG_AD_MASK; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT && iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG && iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { ret = -EIO; goto out; } iinfo->i_hidden = hidden_inode; iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; iinfo->i_lenAlloc = 0; iinfo->i_next_alloc_block = 0; iinfo->i_next_alloc_goal = 0; if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { iinfo->i_efe = 1; iinfo->i_use = 0; ret = udf_alloc_i_data(inode, bs - sizeof(struct extendedFileEntry)); if (ret) goto out; memcpy(iinfo->i_data, bh->b_data + sizeof(struct extendedFileEntry), bs - sizeof(struct extendedFileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { iinfo->i_efe = 0; iinfo->i_use = 0; ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry)); if (ret) goto out; memcpy(iinfo->i_data, bh->b_data + sizeof(struct fileEntry), bs - sizeof(struct fileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { iinfo->i_efe = 0; iinfo->i_use = 1; iinfo->i_lenAlloc = le32_to_cpu( ((struct unallocSpaceEntry *)bh->b_data)-> lengthAllocDescs); ret = udf_alloc_i_data(inode, bs - sizeof(struct unallocSpaceEntry)); if (ret) goto out; memcpy(iinfo->i_data, bh->b_data + sizeof(struct unallocSpaceEntry), bs - sizeof(struct unallocSpaceEntry)); return 0; } ret = -EIO; read_lock(&sbi->s_cred_lock); uid = le32_to_cpu(fe->uid); if (uid == UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; else i_uid_write(inode, uid); gid = le32_to_cpu(fe->gid); if (gid == UDF_INVALID_ID || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) inode->i_gid = sbi->s_gid; else i_gid_write(inode, gid); if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_fmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_fmode; else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_dmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_dmode; else inode->i_mode = udf_convert_permissions(fe); inode->i_mode &= ~sbi->s_umask; iinfo->i_extraPerms = le32_to_cpu(fe->permissions) & ~FE_MAPPED_PERMS; read_unlock(&sbi->s_cred_lock); link_count = le16_to_cpu(fe->fileLinkCount); if (!link_count) { if (!hidden_inode) { ret = -ESTALE; goto out; } link_count = 1; } set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; if (iinfo->i_efe == 0) { inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); udf_disk_stamp_to_time(&ts, fe->accessTime); inode_set_atime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, fe->modificationTime); inode_set_mtime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, fe->attrTime); inode_set_ctime_to_ts(inode, ts); iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint); iinfo->i_streamdir = 0; iinfo->i_lenStreams = 0; } else { inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); udf_disk_stamp_to_time(&ts, efe->accessTime); inode_set_atime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, efe->modificationTime); inode_set_mtime_to_ts(inode, ts); udf_disk_stamp_to_time(&ts, efe->attrTime); inode_set_ctime_to_ts(inode, ts); udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime); iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); /* Named streams */ iinfo->i_streamdir = (efe->streamDirectoryICB.extLength != 0); iinfo->i_locStreamdir = lelb_to_cpu(efe->streamDirectoryICB.extLocation); iinfo->i_lenStreams = le64_to_cpu(efe->objectSize); if (iinfo->i_lenStreams >= inode->i_size) iinfo->i_lenStreams -= inode->i_size; else iinfo->i_lenStreams = 0; } inode->i_generation = iinfo->i_unique; /* * Sanity check length of allocation descriptors and extended attrs to * avoid integer overflows */ if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs) goto out; /* Now do exact checks */ if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs) goto out; /* Sanity checks for files in ICB so that we don't get confused later */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { /* * For file in ICB data is stored in allocation descriptor * so sizes should match */ if (iinfo->i_lenAlloc != inode->i_size) goto out; /* File in ICB has to fit in there... */ if (inode->i_size > bs - udf_file_entry_alloc_offset(inode)) goto out; } switch (fe->icbTag.fileType) { case ICBTAG_FILE_TYPE_DIRECTORY: inode->i_op = &udf_dir_inode_operations; inode->i_fop = &udf_dir_operations; inode->i_mode |= S_IFDIR; inc_nlink(inode); break; case ICBTAG_FILE_TYPE_REALTIME: case ICBTAG_FILE_TYPE_REGULAR: case ICBTAG_FILE_TYPE_UNDEF: case ICBTAG_FILE_TYPE_VAT20: inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; inode->i_mode |= S_IFREG; break; case ICBTAG_FILE_TYPE_BLOCK: inode->i_mode |= S_IFBLK; break; case ICBTAG_FILE_TYPE_CHAR: inode->i_mode |= S_IFCHR; break; case ICBTAG_FILE_TYPE_FIFO: init_special_inode(inode, inode->i_mode | S_IFIFO, 0); break; case ICBTAG_FILE_TYPE_SOCKET: init_special_inode(inode, inode->i_mode | S_IFSOCK, 0); break; case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); inode->i_mode = S_IFLNK | 0777; break; case ICBTAG_FILE_TYPE_MAIN: udf_debug("METADATA FILE-----\n"); break; case ICBTAG_FILE_TYPE_MIRROR: udf_debug("METADATA MIRROR FILE-----\n"); break; case ICBTAG_FILE_TYPE_BITMAP: udf_debug("METADATA BITMAP FILE-----\n"); break; default: udf_err(inode->i_sb, "(ino %lu) failed unknown file type=%u\n", inode->i_ino, fe->icbTag.fileType); goto out; } if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (dsea) { init_special_inode(inode, inode->i_mode, MKDEV(le32_to_cpu(dsea->majorDeviceIdent), le32_to_cpu(dsea->minorDeviceIdent))); /* Developer ID ??? */ } else goto out; } ret = 0; out: brelse(bh); return ret; } static int udf_alloc_i_data(struct inode *inode, size_t size) { struct udf_inode_info *iinfo = UDF_I(inode); iinfo->i_data = kmalloc(size, GFP_KERNEL); if (!iinfo->i_data) return -ENOMEM; return 0; } static umode_t udf_convert_permissions(struct fileEntry *fe) { umode_t mode; uint32_t permissions; uint32_t flags; permissions = le32_to_cpu(fe->permissions); flags = le16_to_cpu(fe->icbTag.flags); mode = ((permissions) & 0007) | ((permissions >> 2) & 0070) | ((permissions >> 4) & 0700) | ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); return mode; } void udf_update_extra_perms(struct inode *inode, umode_t mode) { struct udf_inode_info *iinfo = UDF_I(inode); /* * UDF 2.01 sec. 3.3.3.3 Note 2: * In Unix, delete permission tracks write */ iinfo->i_extraPerms &= ~FE_DELETE_PERMS; if (mode & 0200) iinfo->i_extraPerms |= FE_PERM_U_DELETE; if (mode & 0020) iinfo->i_extraPerms |= FE_PERM_G_DELETE; if (mode & 0002) iinfo->i_extraPerms |= FE_PERM_O_DELETE; } int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } static int udf_sync_inode(struct inode *inode) { return udf_update_inode(inode, 1); } static void udf_adjust_time(struct udf_inode_info *iinfo, struct timespec64 time) { if (iinfo->i_crtime.tv_sec > time.tv_sec || (iinfo->i_crtime.tv_sec == time.tv_sec && iinfo->i_crtime.tv_nsec > time.tv_nsec)) iinfo->i_crtime = time; } static int udf_update_inode(struct inode *inode, int do_sync) { struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; uint64_t lb_recorded; uint32_t udfperms; uint16_t icbflags; uint16_t crclen; int err = 0; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; struct udf_inode_info *iinfo = UDF_I(inode); bh = sb_getblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); if (!bh) { udf_debug("getblk failure\n"); return -EIO; } lock_buffer(bh); memset(bh->b_data, 0, inode->i_sb->s_blocksize); fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; if (iinfo->i_use) { struct unallocSpaceEntry *use = (struct unallocSpaceEntry *)bh->b_data; use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); crclen = sizeof(struct unallocSpaceEntry); goto finish; } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) fe->uid = cpu_to_le32(UDF_INVALID_ID); else fe->uid = cpu_to_le32(i_uid_read(inode)); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) fe->gid = cpu_to_le32(UDF_INVALID_ID); else fe->gid = cpu_to_le32(i_gid_read(inode)); udfperms = ((inode->i_mode & 0007)) | ((inode->i_mode & 0070) << 2) | ((inode->i_mode & 0700) << 4); udfperms |= iinfo->i_extraPerms; fe->permissions = cpu_to_le32(udfperms); if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); else { if (iinfo->i_hidden) fe->fileLinkCount = cpu_to_le16(0); else fe->fileLinkCount = cpu_to_le16(inode->i_nlink); } fe->informationLength = cpu_to_le64(inode->i_size); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct regid *eid; struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (!dsea) { dsea = (struct deviceSpec *) udf_add_extendedattr(inode, sizeof(struct deviceSpec) + sizeof(struct regid), 12, 0x3); dsea->attrType = cpu_to_le32(12); dsea->attrSubtype = 1; dsea->attrLength = cpu_to_le32( sizeof(struct deviceSpec) + sizeof(struct regid)); dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); } eid = (struct regid *)dsea->impUse; memset(eid, 0, sizeof(*eid)); strcpy(eid->ident, UDF_ID_DEVELOPER); eid->identSuffix[0] = UDF_OS_CLASS_UNIX; eid->identSuffix[1] = UDF_OS_ID_LINUX; dsea->majorDeviceIdent = cpu_to_le32(imajor(inode)); dsea->minorDeviceIdent = cpu_to_le32(iminor(inode)); } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) lb_recorded = 0; /* No extents => no blocks! */ else lb_recorded = (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> (blocksize_bits - 9); if (iinfo->i_efe == 0) { memcpy(bh->b_data + sizeof(struct fileEntry), iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode)); udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode)); udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode)); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; fe->uniqueID = cpu_to_le64(iinfo->i_unique); fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); crclen = sizeof(struct fileEntry); } else { memcpy(bh->b_data + sizeof(struct extendedFileEntry), iinfo->i_data, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); efe->objectSize = cpu_to_le64(inode->i_size + iinfo->i_lenStreams); efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); if (iinfo->i_streamdir) { struct long_ad *icb_lad = &efe->streamDirectoryICB; icb_lad->extLocation = cpu_to_lelb(iinfo->i_locStreamdir); icb_lad->extLength = cpu_to_le32(inode->i_sb->s_blocksize); } udf_adjust_time(iinfo, inode_get_atime(inode)); udf_adjust_time(iinfo, inode_get_mtime(inode)); udf_adjust_time(iinfo, inode_get_ctime(inode)); udf_time_to_disk_stamp(&efe->accessTime, inode_get_atime(inode)); udf_time_to_disk_stamp(&efe->modificationTime, inode_get_mtime(inode)); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode)); memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; efe->uniqueID = cpu_to_le64(iinfo->i_unique); efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } finish: if (iinfo->i_strat4096) { fe->icbTag.strategyType = cpu_to_le16(4096); fe->icbTag.strategyParameter = cpu_to_le16(1); fe->icbTag.numEntries = cpu_to_le16(2); } else { fe->icbTag.strategyType = cpu_to_le16(4); fe->icbTag.numEntries = cpu_to_le16(1); } if (iinfo->i_use) fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE; else if (S_ISDIR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; else if (S_ISREG(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; else if (S_ISLNK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_SYMLINK; else if (S_ISBLK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_BLOCK; else if (S_ISCHR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_CHAR; else if (S_ISFIFO(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_FIFO; else if (S_ISSOCK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_SOCKET; icbflags = iinfo->i_alloc_type | ((inode->i_mode & S_ISUID) ? ICBTAG_FLAG_SETUID : 0) | ((inode->i_mode & S_ISGID) ? ICBTAG_FLAG_SETGID : 0) | ((inode->i_mode & S_ISVTX) ? ICBTAG_FLAG_STICKY : 0) | (le16_to_cpu(fe->icbTag.flags) & ~(ICBTAG_FLAG_AD_MASK | ICBTAG_FLAG_SETUID | ICBTAG_FLAG_SETGID | ICBTAG_FLAG_STICKY)); fe->icbTag.flags = cpu_to_le16(icbflags); if (sbi->s_udfrev >= 0x0200) fe->descTag.descVersion = cpu_to_le16(3); else fe->descTag.descVersion = cpu_to_le16(2); fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); fe->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag); fe->descTag.descCRCLength = cpu_to_le16(crclen); fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); set_buffer_uptodate(bh); unlock_buffer(bh); /* write the data blocks */ mark_buffer_dirty(bh); if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", inode->i_ino); err = -EIO; } } brelse(bh); return err; } struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, bool hidden_inode) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); int err; if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { if (UDF_I(inode)->i_hidden != hidden_inode) { iput(inode); return ERR_PTR(-EFSCORRUPTED); } return inode; } memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); err = udf_read_inode(inode, hidden_inode); if (err < 0) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } int udf_setup_indirect_aext(struct inode *inode, udf_pblk_t block, struct extent_position *epos) { struct super_block *sb = inode->i_sb; struct buffer_head *bh; struct allocExtDesc *aed; struct extent_position nepos; struct kernel_lb_addr neloc; int ver, adsize; int err = 0; if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; neloc.logicalBlockNum = block; neloc.partitionReferenceNum = epos->block.partitionReferenceNum; bh = sb_getblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); if (!bh) return -EIO; lock_buffer(bh); memset(bh->b_data, 0x00, sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); aed = (struct allocExtDesc *)(bh->b_data); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) { aed->previousAllocExtLocation = cpu_to_le32(epos->block.logicalBlockNum); } aed->lengthAllocDescs = cpu_to_le32(0); if (UDF_SB(sb)->s_udfrev >= 0x0200) ver = 3; else ver = 2; udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block, sizeof(struct tag)); nepos.block = neloc; nepos.offset = sizeof(struct allocExtDesc); nepos.bh = bh; /* * Do we have to copy current last extent to make space for indirect * one? */ if (epos->offset + adsize > sb->s_blocksize) { struct kernel_lb_addr cp_loc; uint32_t cp_len; int8_t cp_type; epos->offset -= adsize; err = udf_current_aext(inode, epos, &cp_loc, &cp_len, &cp_type, 0); if (err <= 0) goto err_out; cp_len |= ((uint32_t)cp_type) << 30; __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); udf_write_aext(inode, epos, &nepos.block, sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } else { __udf_add_aext(inode, epos, &nepos.block, sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDESCS, 0); } brelse(epos->bh); *epos = nepos; return 0; err_out: brelse(bh); return err; } /* * Append extent at the given position - should be the first free one in inode * / indirect extent. This function assumes there is enough space in the inode * or indirect extent. Use udf_add_aext() if you didn't check for this before. */ int __udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { struct udf_inode_info *iinfo = UDF_I(inode); struct allocExtDesc *aed; int adsize; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; if (!epos->bh) { WARN_ON(iinfo->i_lenAlloc != epos->offset - udf_file_entry_alloc_offset(inode)); } else { aed = (struct allocExtDesc *)epos->bh->b_data; WARN_ON(le32_to_cpu(aed->lengthAllocDescs) != epos->offset - sizeof(struct allocExtDesc)); WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize); } udf_write_aext(inode, epos, eloc, elen, inc); if (!epos->bh) { iinfo->i_lenAlloc += adsize; mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)epos->bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(epos->bh->b_data, epos->offset + (inc ? 0 : adsize)); else udf_update_tag(epos->bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(epos->bh, inode); } return 0; } /* * Append extent at given position - should be the first free one in inode * / indirect extent. Takes care of allocating and linking indirect blocks. */ int udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; struct super_block *sb = inode->i_sb; if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; if (epos->offset + (2 * adsize) > sb->s_blocksize) { int err; udf_pblk_t new_block; new_block = udf_new_block(sb, NULL, epos->block.partitionReferenceNum, epos->block.logicalBlockNum, &err); if (!new_block) return -ENOSPC; err = udf_setup_indirect_aext(inode, new_block, epos); if (err) return err; } return __udf_add_aext(inode, epos, eloc, elen, inc); } void udf_write_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; uint8_t *ptr; struct short_ad *sad; struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; else ptr = epos->bh->b_data + epos->offset; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: sad = (struct short_ad *)ptr; sad->extLength = cpu_to_le32(elen); sad->extPosition = cpu_to_le32(eloc->logicalBlockNum); adsize = sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: lad = (struct long_ad *)ptr; lad->extLength = cpu_to_le32(elen); lad->extLocation = cpu_to_lelb(*eloc); memset(lad->impUse, 0x00, sizeof(lad->impUse)); adsize = sizeof(struct long_ad); break; default: return; } if (epos->bh) { if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) { struct allocExtDesc *aed = (struct allocExtDesc *)epos->bh->b_data; udf_update_tag(epos->bh->b_data, le32_to_cpu(aed->lengthAllocDescs) + sizeof(struct allocExtDesc)); } mark_buffer_dirty_inode(epos->bh, inode); } else { mark_inode_dirty(inode); } if (inc) epos->offset += adsize; } /* * Only 1 indirect extent in a row really makes sense but allow upto 16 in case * someone does some weird stuff. */ #define UDF_MAX_INDIR_EXTS 16 /* * Returns 1 on success, -errno on error, 0 on hit EOF. */ int udf_next_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int8_t *etype, int inc) { unsigned int indirections = 0; int ret = 0; udf_pblk_t block; while (1) { ret = udf_current_aext(inode, epos, eloc, elen, etype, inc); if (ret <= 0) return ret; if (*etype != (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) return ret; if (++indirections > UDF_MAX_INDIR_EXTS) { udf_err(inode->i_sb, "too many indirect extents in inode %lu\n", inode->i_ino); return -EFSCORRUPTED; } epos->block = *eloc; epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); epos->bh = sb_bread(inode->i_sb, block); if (!epos->bh) { udf_debug("reading block %u failed!\n", block); return -EIO; } } } /* * Returns 1 on success, -errno on error, 0 on hit EOF. */ int udf_current_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int8_t *etype, int inc) { int alen; uint8_t *ptr; struct short_ad *sad; struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) { if (!epos->offset) epos->offset = udf_file_entry_alloc_offset(inode); ptr = iinfo->i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; alen = udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc; } else { struct allocExtDesc *header = (struct allocExtDesc *)epos->bh->b_data; if (!epos->offset) epos->offset = sizeof(struct allocExtDesc); ptr = epos->bh->b_data + epos->offset; if (check_add_overflow(sizeof(struct allocExtDesc), le32_to_cpu(header->lengthAllocDescs), &alen)) return -1; } switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc); if (!sad) return 0; *etype = le32_to_cpu(sad->extLength) >> 30; eloc->logicalBlockNum = le32_to_cpu(sad->extPosition); eloc->partitionReferenceNum = iinfo->i_location.partitionReferenceNum; *elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK; break; case ICBTAG_FLAG_AD_LONG: lad = udf_get_filelongad(ptr, alen, &epos->offset, inc); if (!lad) return 0; *etype = le32_to_cpu(lad->extLength) >> 30; *eloc = lelb_to_cpu(lad->extLocation); *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; break; default: udf_debug("alloc_type = %u unsupported\n", iinfo->i_alloc_type); return -EINVAL; } return 1; } static int udf_insert_aext(struct inode *inode, struct extent_position epos, struct kernel_lb_addr neloc, uint32_t nelen) { struct kernel_lb_addr oeloc; uint32_t oelen; int8_t etype; int ret; if (epos.bh) get_bh(epos.bh); while (1) { ret = udf_next_aext(inode, &epos, &oeloc, &oelen, &etype, 0); if (ret <= 0) break; udf_write_aext(inode, &epos, &neloc, nelen, 1); neloc = oeloc; nelen = (etype << 30) | oelen; } if (ret == 0) ret = udf_add_aext(inode, &epos, &neloc, nelen, 1); brelse(epos.bh); return ret; } int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) { struct extent_position oepos; int adsize; int8_t etype; struct allocExtDesc *aed; struct udf_inode_info *iinfo; struct kernel_lb_addr eloc; uint32_t elen; int ret; if (epos.bh) { get_bh(epos.bh); get_bh(epos.bh); } iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else adsize = 0; oepos = epos; if (udf_next_aext(inode, &epos, &eloc, &elen, &etype, 1) <= 0) return -1; while (1) { ret = udf_next_aext(inode, &epos, &eloc, &elen, &etype, 1); if (ret < 0) { brelse(epos.bh); brelse(oepos.bh); return -1; } if (ret == 0) break; udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1); if (oepos.bh != epos.bh) { oepos.block = epos.block; brelse(oepos.bh); get_bh(epos.bh); oepos.bh = epos.bh; oepos.offset = epos.offset - adsize; } } memset(&eloc, 0x00, sizeof(struct kernel_lb_addr)); elen = 0; if (epos.bh != oepos.bh) { udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1); udf_write_aext(inode, &oepos, &eloc, elen, 1); udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= (adsize * 2); mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize)); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, oepos.offset - (2 * adsize)); else udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(oepos.bh, inode); } } else { udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= adsize; mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, -adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, epos.offset - adsize); else udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(oepos.bh, inode); } } brelse(epos.bh); brelse(oepos.bh); return (elen >> 30); } /* * Returns 1 on success, -errno on error, 0 on hit EOF. */ int inode_bmap(struct inode *inode, sector_t block, struct extent_position *pos, struct kernel_lb_addr *eloc, uint32_t *elen, sector_t *offset, int8_t *etype) { unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; loff_t lbcount = 0, bcount = (loff_t) block << blocksize_bits; struct udf_inode_info *iinfo; int err = 0; iinfo = UDF_I(inode); if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) { pos->offset = 0; pos->block = iinfo->i_location; pos->bh = NULL; } *elen = 0; do { err = udf_next_aext(inode, pos, eloc, elen, etype, 1); if (err <= 0) { if (err == 0) { *offset = (bcount - lbcount) >> blocksize_bits; iinfo->i_lenExtents = lbcount; } return err; } lbcount += *elen; } while (lbcount <= bcount); /* update extent cache */ udf_update_extent_cache(inode, lbcount - *elen, pos); *offset = (bcount + *elen - lbcount) >> blocksize_bits; return 1; }
7 7 4 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 // SPDX-License-Identifier: GPL-2.0-or-later /* * ChaCha and XChaCha stream ciphers, including ChaCha20 (RFC7539) * * Copyright (C) 2015 Martin Willi * Copyright (C) 2018 Google LLC */ #include <linux/unaligned.h> #include <crypto/algapi.h> #include <crypto/internal/chacha.h> #include <crypto/internal/skcipher.h> #include <linux/module.h> static int chacha_stream_xor(struct skcipher_request *req, const struct chacha_ctx *ctx, const u8 *iv) { struct skcipher_walk walk; u32 state[16]; int err; err = skcipher_walk_virt(&walk, req, false); chacha_init_generic(state, ctx->key, iv); while (walk.nbytes > 0) { unsigned int nbytes = walk.nbytes; if (nbytes < walk.total) nbytes = round_down(nbytes, CHACHA_BLOCK_SIZE); chacha_crypt_generic(state, walk.dst.virt.addr, walk.src.virt.addr, nbytes, ctx->nrounds); err = skcipher_walk_done(&walk, walk.nbytes - nbytes); } return err; } static int crypto_chacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); return chacha_stream_xor(req, ctx, req->iv); } static int crypto_xchacha_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); struct chacha_ctx subctx; u32 state[16]; u8 real_iv[16]; /* Compute the subkey given the original key and first 128 nonce bits */ chacha_init_generic(state, ctx->key, req->iv); hchacha_block_generic(state, subctx.key, ctx->nrounds); subctx.nrounds = ctx->nrounds; /* Build the real IV */ memcpy(&real_iv[0], req->iv + 24, 8); /* stream position */ memcpy(&real_iv[8], req->iv + 16, 8); /* remaining 64 nonce bits */ /* Generate the stream and XOR it with the data */ return chacha_stream_xor(req, &subctx, real_iv); } static struct skcipher_alg algs[] = { { .base.cra_name = "chacha20", .base.cra_driver_name = "chacha20-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = CHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_chacha_crypt, .decrypt = crypto_chacha_crypt, }, { .base.cra_name = "xchacha20", .base.cra_driver_name = "xchacha20-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha20_setkey, .encrypt = crypto_xchacha_crypt, .decrypt = crypto_xchacha_crypt, }, { .base.cra_name = "xchacha12", .base.cra_driver_name = "xchacha12-generic", .base.cra_priority = 100, .base.cra_blocksize = 1, .base.cra_ctxsize = sizeof(struct chacha_ctx), .base.cra_module = THIS_MODULE, .min_keysize = CHACHA_KEY_SIZE, .max_keysize = CHACHA_KEY_SIZE, .ivsize = XCHACHA_IV_SIZE, .chunksize = CHACHA_BLOCK_SIZE, .setkey = chacha12_setkey, .encrypt = crypto_xchacha_crypt, .decrypt = crypto_xchacha_crypt, } }; static int __init chacha_generic_mod_init(void) { return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); } static void __exit chacha_generic_mod_fini(void) { crypto_unregister_skciphers(algs, ARRAY_SIZE(algs)); } subsys_initcall(chacha_generic_mod_init); module_exit(chacha_generic_mod_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Martin Willi <martin@strongswan.org>"); MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (generic)"); MODULE_ALIAS_CRYPTO("chacha20"); MODULE_ALIAS_CRYPTO("chacha20-generic"); MODULE_ALIAS_CRYPTO("xchacha20"); MODULE_ALIAS_CRYPTO("xchacha20-generic"); MODULE_ALIAS_CRYPTO("xchacha12"); MODULE_ALIAS_CRYPTO("xchacha12-generic");
11 1 10 7 10 10 8 8 8 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 // SPDX-License-Identifier: GPL-2.0 /* * fs-verity hash algorithms * * Copyright 2019 Google LLC */ #include "fsverity_private.h" #include <crypto/hash.h> /* The hash algorithms supported by fs-verity */ struct fsverity_hash_alg fsverity_hash_algs[] = { [FS_VERITY_HASH_ALG_SHA256] = { .name = "sha256", .digest_size = SHA256_DIGEST_SIZE, .block_size = SHA256_BLOCK_SIZE, .algo_id = HASH_ALGO_SHA256, }, [FS_VERITY_HASH_ALG_SHA512] = { .name = "sha512", .digest_size = SHA512_DIGEST_SIZE, .block_size = SHA512_BLOCK_SIZE, .algo_id = HASH_ALGO_SHA512, }, }; static DEFINE_MUTEX(fsverity_hash_alg_init_mutex); /** * fsverity_get_hash_alg() - validate and prepare a hash algorithm * @inode: optional inode for logging purposes * @num: the hash algorithm number * * Get the struct fsverity_hash_alg for the given hash algorithm number, and * ensure it has a hash transform ready to go. The hash transforms are * allocated on-demand so that we don't waste resources unnecessarily, and * because the crypto modules may be initialized later than fs/verity/. * * Return: pointer to the hash alg on success, else an ERR_PTR() */ const struct fsverity_hash_alg *fsverity_get_hash_alg(const struct inode *inode, unsigned int num) { struct fsverity_hash_alg *alg; struct crypto_shash *tfm; int err; if (num >= ARRAY_SIZE(fsverity_hash_algs) || !fsverity_hash_algs[num].name) { fsverity_warn(inode, "Unknown hash algorithm number: %u", num); return ERR_PTR(-EINVAL); } alg = &fsverity_hash_algs[num]; /* pairs with smp_store_release() below */ if (likely(smp_load_acquire(&alg->tfm) != NULL)) return alg; mutex_lock(&fsverity_hash_alg_init_mutex); if (alg->tfm != NULL) goto out_unlock; tfm = crypto_alloc_shash(alg->name, 0, 0); if (IS_ERR(tfm)) { if (PTR_ERR(tfm) == -ENOENT) { fsverity_warn(inode, "Missing crypto API support for hash algorithm \"%s\"", alg->name); alg = ERR_PTR(-ENOPKG); goto out_unlock; } fsverity_err(inode, "Error allocating hash algorithm \"%s\": %ld", alg->name, PTR_ERR(tfm)); alg = ERR_CAST(tfm); goto out_unlock; } err = -EINVAL; if (WARN_ON_ONCE(alg->digest_size != crypto_shash_digestsize(tfm))) goto err_free_tfm; if (WARN_ON_ONCE(alg->block_size != crypto_shash_blocksize(tfm))) goto err_free_tfm; pr_info("%s using implementation \"%s\"\n", alg->name, crypto_shash_driver_name(tfm)); /* pairs with smp_load_acquire() above */ smp_store_release(&alg->tfm, tfm); goto out_unlock; err_free_tfm: crypto_free_shash(tfm); alg = ERR_PTR(err); out_unlock: mutex_unlock(&fsverity_hash_alg_init_mutex); return alg; } /** * fsverity_prepare_hash_state() - precompute the initial hash state * @alg: hash algorithm * @salt: a salt which is to be prepended to all data to be hashed * @salt_size: salt size in bytes, possibly 0 * * Return: NULL if the salt is empty, otherwise the kmalloc()'ed precomputed * initial hash state on success or an ERR_PTR() on failure. */ const u8 *fsverity_prepare_hash_state(const struct fsverity_hash_alg *alg, const u8 *salt, size_t salt_size) { u8 *hashstate = NULL; SHASH_DESC_ON_STACK(desc, alg->tfm); u8 *padded_salt = NULL; size_t padded_salt_size; int err; desc->tfm = alg->tfm; if (salt_size == 0) return NULL; hashstate = kmalloc(crypto_shash_statesize(alg->tfm), GFP_KERNEL); if (!hashstate) return ERR_PTR(-ENOMEM); /* * Zero-pad the salt to the next multiple of the input size of the hash * algorithm's compression function, e.g. 64 bytes for SHA-256 or 128 * bytes for SHA-512. This ensures that the hash algorithm won't have * any bytes buffered internally after processing the salt, thus making * salted hashing just as fast as unsalted hashing. */ padded_salt_size = round_up(salt_size, alg->block_size); padded_salt = kzalloc(padded_salt_size, GFP_KERNEL); if (!padded_salt) { err = -ENOMEM; goto err_free; } memcpy(padded_salt, salt, salt_size); err = crypto_shash_init(desc); if (err) goto err_free; err = crypto_shash_update(desc, padded_salt, padded_salt_size); if (err) goto err_free; err = crypto_shash_export(desc, hashstate); if (err) goto err_free; out: kfree(padded_salt); return hashstate; err_free: kfree(hashstate); hashstate = ERR_PTR(err); goto out; } /** * fsverity_hash_block() - hash a single data or hash block * @params: the Merkle tree's parameters * @inode: inode for which the hashing is being done * @data: virtual address of a buffer containing the block to hash * @out: output digest, size 'params->digest_size' bytes * * Hash a single data or hash block. The hash is salted if a salt is specified * in the Merkle tree parameters. * * Return: 0 on success, -errno on failure */ int fsverity_hash_block(const struct merkle_tree_params *params, const struct inode *inode, const void *data, u8 *out) { SHASH_DESC_ON_STACK(desc, params->hash_alg->tfm); int err; desc->tfm = params->hash_alg->tfm; if (params->hashstate) { err = crypto_shash_import(desc, params->hashstate); if (err) { fsverity_err(inode, "Error %d importing hash state", err); return err; } err = crypto_shash_finup(desc, data, params->block_size, out); } else { err = crypto_shash_digest(desc, data, params->block_size, out); } if (err) fsverity_err(inode, "Error %d computing block hash", err); return err; } /** * fsverity_hash_buffer() - hash some data * @alg: the hash algorithm to use * @data: the data to hash * @size: size of data to hash, in bytes * @out: output digest, size 'alg->digest_size' bytes * * Return: 0 on success, -errno on failure */ int fsverity_hash_buffer(const struct fsverity_hash_alg *alg, const void *data, size_t size, u8 *out) { return crypto_shash_tfm_digest(alg->tfm, data, size, out); } void __init fsverity_check_hash_algs(void) { size_t i; /* * Sanity check the hash algorithms (could be a build-time check, but * they're in an array) */ for (i = 0; i < ARRAY_SIZE(fsverity_hash_algs); i++) { const struct fsverity_hash_alg *alg = &fsverity_hash_algs[i]; if (!alg->name) continue; /* * 0 must never be allocated as an FS_VERITY_HASH_ALG_* value, * as it is reserved for users that use 0 to mean unspecified or * a default value. fs/verity/ itself doesn't care and doesn't * have a default algorithm, but some users make use of this. */ BUG_ON(i == 0); BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE); /* * For efficiency, the implementation currently assumes the * digest and block sizes are powers of 2. This limitation can * be lifted if the code is updated to handle other values. */ BUG_ON(!is_power_of_2(alg->digest_size)); BUG_ON(!is_power_of_2(alg->block_size)); /* Verify that there is a valid mapping to HASH_ALGO_*. */ BUG_ON(alg->algo_id == 0); BUG_ON(alg->digest_size != hash_digest_size[alg->algo_id]); } }
7 6 6 5 1 4 2 1 1 1 24 1 19 5 19 5 1 1 3 4 2 1 1 5 3 5 4 2 2 2 1 2 2 1 1 2 2 1 3 3 3 3 11 2 2 7 2 3 4 26 4 24 1 1 4 4 5 4 3 2 2 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 // SPDX-License-Identifier: GPL-2.0-or-later /* * User-space I/O driver support for HID subsystem * Copyright (c) 2012 David Herrmann */ /* */ #include <linux/atomic.h> #include <linux/compat.h> #include <linux/cred.h> #include <linux/device.h> #include <linux/fs.h> #include <linux/hid.h> #include <linux/input.h> #include <linux/miscdevice.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/uhid.h> #include <linux/wait.h> #define UHID_NAME "uhid" #define UHID_BUFSIZE 32 struct uhid_device { struct mutex devlock; /* This flag tracks whether the HID device is usable for commands from * userspace. The flag is already set before hid_add_device(), which * runs in workqueue context, to allow hid_add_device() to communicate * with userspace. * However, if hid_add_device() fails, the flag is cleared without * holding devlock. * We guarantee that if @running changes from true to false while you're * holding @devlock, it's still fine to access @hid. */ bool running; __u8 *rd_data; uint rd_size; /* When this is NULL, userspace may use UHID_CREATE/UHID_CREATE2. */ struct hid_device *hid; struct uhid_event input_buf; wait_queue_head_t waitq; spinlock_t qlock; __u8 head; __u8 tail; struct uhid_event *outq[UHID_BUFSIZE]; /* blocking GET_REPORT support; state changes protected by qlock */ struct mutex report_lock; wait_queue_head_t report_wait; bool report_running; u32 report_id; u32 report_type; struct uhid_event report_buf; struct work_struct worker; }; static struct miscdevice uhid_misc; static void uhid_device_add_worker(struct work_struct *work) { struct uhid_device *uhid = container_of(work, struct uhid_device, worker); int ret; ret = hid_add_device(uhid->hid); if (ret) { hid_err(uhid->hid, "Cannot register HID device: error %d\n", ret); /* We used to call hid_destroy_device() here, but that's really * messy to get right because we have to coordinate with * concurrent writes from userspace that might be in the middle * of using uhid->hid. * Just leave uhid->hid as-is for now, and clean it up when * userspace tries to close or reinitialize the uhid instance. * * However, we do have to clear the ->running flag and do a * wakeup to make sure userspace knows that the device is gone. */ WRITE_ONCE(uhid->running, false); wake_up_interruptible(&uhid->report_wait); } } static void uhid_queue(struct uhid_device *uhid, struct uhid_event *ev) { __u8 newhead; newhead = (uhid->head + 1) % UHID_BUFSIZE; if (newhead != uhid->tail) { uhid->outq[uhid->head] = ev; uhid->head = newhead; wake_up_interruptible(&uhid->waitq); } else { hid_warn(uhid->hid, "Output queue is full\n"); kfree(ev); } } static int uhid_queue_event(struct uhid_device *uhid, __u32 event) { unsigned long flags; struct uhid_event *ev; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = event; spin_lock_irqsave(&uhid->qlock, flags); uhid_queue(uhid, ev); spin_unlock_irqrestore(&uhid->qlock, flags); return 0; } static int uhid_hid_start(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; struct uhid_event *ev; unsigned long flags; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = UHID_START; if (hid->report_enum[HID_FEATURE_REPORT].numbered) ev->u.start.dev_flags |= UHID_DEV_NUMBERED_FEATURE_REPORTS; if (hid->report_enum[HID_OUTPUT_REPORT].numbered) ev->u.start.dev_flags |= UHID_DEV_NUMBERED_OUTPUT_REPORTS; if (hid->report_enum[HID_INPUT_REPORT].numbered) ev->u.start.dev_flags |= UHID_DEV_NUMBERED_INPUT_REPORTS; spin_lock_irqsave(&uhid->qlock, flags); uhid_queue(uhid, ev); spin_unlock_irqrestore(&uhid->qlock, flags); return 0; } static void uhid_hid_stop(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; hid->claimed = 0; uhid_queue_event(uhid, UHID_STOP); } static int uhid_hid_open(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; return uhid_queue_event(uhid, UHID_OPEN); } static void uhid_hid_close(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; uhid_queue_event(uhid, UHID_CLOSE); } static int uhid_hid_parse(struct hid_device *hid) { struct uhid_device *uhid = hid->driver_data; return hid_parse_report(hid, uhid->rd_data, uhid->rd_size); } /* must be called with report_lock held */ static int __uhid_report_queue_and_wait(struct uhid_device *uhid, struct uhid_event *ev, __u32 *report_id) { unsigned long flags; int ret; spin_lock_irqsave(&uhid->qlock, flags); *report_id = ++uhid->report_id; uhid->report_type = ev->type + 1; uhid->report_running = true; uhid_queue(uhid, ev); spin_unlock_irqrestore(&uhid->qlock, flags); ret = wait_event_interruptible_timeout(uhid->report_wait, !uhid->report_running || !READ_ONCE(uhid->running), 5 * HZ); if (!ret || !READ_ONCE(uhid->running) || uhid->report_running) ret = -EIO; else if (ret < 0) ret = -ERESTARTSYS; else ret = 0; uhid->report_running = false; return ret; } static void uhid_report_wake_up(struct uhid_device *uhid, u32 id, const struct uhid_event *ev) { unsigned long flags; spin_lock_irqsave(&uhid->qlock, flags); /* id for old report; drop it silently */ if (uhid->report_type != ev->type || uhid->report_id != id) goto unlock; if (!uhid->report_running) goto unlock; memcpy(&uhid->report_buf, ev, sizeof(*ev)); uhid->report_running = false; wake_up_interruptible(&uhid->report_wait); unlock: spin_unlock_irqrestore(&uhid->qlock, flags); } static int uhid_hid_get_report(struct hid_device *hid, unsigned char rnum, u8 *buf, size_t count, u8 rtype) { struct uhid_device *uhid = hid->driver_data; struct uhid_get_report_reply_req *req; struct uhid_event *ev; int ret; if (!READ_ONCE(uhid->running)) return -EIO; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = UHID_GET_REPORT; ev->u.get_report.rnum = rnum; ev->u.get_report.rtype = rtype; ret = mutex_lock_interruptible(&uhid->report_lock); if (ret) { kfree(ev); return ret; } /* this _always_ takes ownership of @ev */ ret = __uhid_report_queue_and_wait(uhid, ev, &ev->u.get_report.id); if (ret) goto unlock; req = &uhid->report_buf.u.get_report_reply; if (req->err) { ret = -EIO; } else { ret = min3(count, (size_t)req->size, (size_t)UHID_DATA_MAX); memcpy(buf, req->data, ret); } unlock: mutex_unlock(&uhid->report_lock); return ret; } static int uhid_hid_set_report(struct hid_device *hid, unsigned char rnum, const u8 *buf, size_t count, u8 rtype) { struct uhid_device *uhid = hid->driver_data; struct uhid_event *ev; int ret; if (!READ_ONCE(uhid->running) || count > UHID_DATA_MAX) return -EIO; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = UHID_SET_REPORT; ev->u.set_report.rnum = rnum; ev->u.set_report.rtype = rtype; ev->u.set_report.size = count; memcpy(ev->u.set_report.data, buf, count); ret = mutex_lock_interruptible(&uhid->report_lock); if (ret) { kfree(ev); return ret; } /* this _always_ takes ownership of @ev */ ret = __uhid_report_queue_and_wait(uhid, ev, &ev->u.set_report.id); if (ret) goto unlock; if (uhid->report_buf.u.set_report_reply.err) ret = -EIO; else ret = count; unlock: mutex_unlock(&uhid->report_lock); return ret; } static int uhid_hid_raw_request(struct hid_device *hid, unsigned char reportnum, __u8 *buf, size_t len, unsigned char rtype, int reqtype) { u8 u_rtype; switch (rtype) { case HID_FEATURE_REPORT: u_rtype = UHID_FEATURE_REPORT; break; case HID_OUTPUT_REPORT: u_rtype = UHID_OUTPUT_REPORT; break; case HID_INPUT_REPORT: u_rtype = UHID_INPUT_REPORT; break; default: return -EINVAL; } switch (reqtype) { case HID_REQ_GET_REPORT: return uhid_hid_get_report(hid, reportnum, buf, len, u_rtype); case HID_REQ_SET_REPORT: return uhid_hid_set_report(hid, reportnum, buf, len, u_rtype); default: return -EIO; } } static int uhid_hid_output_raw(struct hid_device *hid, __u8 *buf, size_t count, unsigned char report_type) { struct uhid_device *uhid = hid->driver_data; __u8 rtype; unsigned long flags; struct uhid_event *ev; switch (report_type) { case HID_FEATURE_REPORT: rtype = UHID_FEATURE_REPORT; break; case HID_OUTPUT_REPORT: rtype = UHID_OUTPUT_REPORT; break; default: return -EINVAL; } if (count < 1 || count > UHID_DATA_MAX) return -EINVAL; ev = kzalloc(sizeof(*ev), GFP_KERNEL); if (!ev) return -ENOMEM; ev->type = UHID_OUTPUT; ev->u.output.size = count; ev->u.output.rtype = rtype; memcpy(ev->u.output.data, buf, count); spin_lock_irqsave(&uhid->qlock, flags); uhid_queue(uhid, ev); spin_unlock_irqrestore(&uhid->qlock, flags); return count; } static int uhid_hid_output_report(struct hid_device *hid, __u8 *buf, size_t count) { return uhid_hid_output_raw(hid, buf, count, HID_OUTPUT_REPORT); } static const struct hid_ll_driver uhid_hid_driver = { .start = uhid_hid_start, .stop = uhid_hid_stop, .open = uhid_hid_open, .close = uhid_hid_close, .parse = uhid_hid_parse, .raw_request = uhid_hid_raw_request, .output_report = uhid_hid_output_report, .max_buffer_size = UHID_DATA_MAX, }; #ifdef CONFIG_COMPAT /* Apparently we haven't stepped on these rakes enough times yet. */ struct uhid_create_req_compat { __u8 name[128]; __u8 phys[64]; __u8 uniq[64]; compat_uptr_t rd_data; __u16 rd_size; __u16 bus; __u32 vendor; __u32 product; __u32 version; __u32 country; } __attribute__((__packed__)); static int uhid_event_from_user(const char __user *buffer, size_t len, struct uhid_event *event) { if (in_compat_syscall()) { u32 type; if (get_user(type, buffer)) return -EFAULT; if (type == UHID_CREATE) { /* * This is our messed up request with compat pointer. * It is largish (more than 256 bytes) so we better * allocate it from the heap. */ struct uhid_create_req_compat *compat; compat = kzalloc(sizeof(*compat), GFP_KERNEL); if (!compat) return -ENOMEM; buffer += sizeof(type); len -= sizeof(type); if (copy_from_user(compat, buffer, min(len, sizeof(*compat)))) { kfree(compat); return -EFAULT; } /* Shuffle the data over to proper structure */ event->type = type; memcpy(event->u.create.name, compat->name, sizeof(compat->name)); memcpy(event->u.create.phys, compat->phys, sizeof(compat->phys)); memcpy(event->u.create.uniq, compat->uniq, sizeof(compat->uniq)); event->u.create.rd_data = compat_ptr(compat->rd_data); event->u.create.rd_size = compat->rd_size; event->u.create.bus = compat->bus; event->u.create.vendor = compat->vendor; event->u.create.product = compat->product; event->u.create.version = compat->version; event->u.create.country = compat->country; kfree(compat); return 0; } /* All others can be copied directly */ } if (copy_from_user(event, buffer, min(len, sizeof(*event)))) return -EFAULT; return 0; } #else static int uhid_event_from_user(const char __user *buffer, size_t len, struct uhid_event *event) { if (copy_from_user(event, buffer, min(len, sizeof(*event)))) return -EFAULT; return 0; } #endif static int uhid_dev_create2(struct uhid_device *uhid, const struct uhid_event *ev) { struct hid_device *hid; size_t rd_size; void *rd_data; int ret; if (uhid->hid) return -EALREADY; rd_size = ev->u.create2.rd_size; if (rd_size <= 0 || rd_size > HID_MAX_DESCRIPTOR_SIZE) return -EINVAL; rd_data = kmemdup(ev->u.create2.rd_data, rd_size, GFP_KERNEL); if (!rd_data) return -ENOMEM; uhid->rd_size = rd_size; uhid->rd_data = rd_data; hid = hid_allocate_device(); if (IS_ERR(hid)) { ret = PTR_ERR(hid); goto err_free; } BUILD_BUG_ON(sizeof(hid->name) != sizeof(ev->u.create2.name)); strscpy(hid->name, ev->u.create2.name, sizeof(hid->name)); BUILD_BUG_ON(sizeof(hid->phys) != sizeof(ev->u.create2.phys)); strscpy(hid->phys, ev->u.create2.phys, sizeof(hid->phys)); BUILD_BUG_ON(sizeof(hid->uniq) != sizeof(ev->u.create2.uniq)); strscpy(hid->uniq, ev->u.create2.uniq, sizeof(hid->uniq)); hid->ll_driver = &uhid_hid_driver; hid->bus = ev->u.create2.bus; hid->vendor = ev->u.create2.vendor; hid->product = ev->u.create2.product; hid->version = ev->u.create2.version; hid->country = ev->u.create2.country; hid->driver_data = uhid; hid->dev.parent = uhid_misc.this_device; uhid->hid = hid; uhid->running = true; /* Adding of a HID device is done through a worker, to allow HID drivers * which use feature requests during .probe to work, without they would * be blocked on devlock, which is held by uhid_char_write. */ schedule_work(&uhid->worker); return 0; err_free: kfree(uhid->rd_data); uhid->rd_data = NULL; uhid->rd_size = 0; return ret; } static int uhid_dev_create(struct uhid_device *uhid, struct uhid_event *ev) { struct uhid_create_req orig; orig = ev->u.create; if (orig.rd_size <= 0 || orig.rd_size > HID_MAX_DESCRIPTOR_SIZE) return -EINVAL; if (copy_from_user(&ev->u.create2.rd_data, orig.rd_data, orig.rd_size)) return -EFAULT; memcpy(ev->u.create2.name, orig.name, sizeof(orig.name)); memcpy(ev->u.create2.phys, orig.phys, sizeof(orig.phys)); memcpy(ev->u.create2.uniq, orig.uniq, sizeof(orig.uniq)); ev->u.create2.rd_size = orig.rd_size; ev->u.create2.bus = orig.bus; ev->u.create2.vendor = orig.vendor; ev->u.create2.product = orig.product; ev->u.create2.version = orig.version; ev->u.create2.country = orig.country; return uhid_dev_create2(uhid, ev); } static int uhid_dev_destroy(struct uhid_device *uhid) { if (!uhid->hid) return -EINVAL; WRITE_ONCE(uhid->running, false); wake_up_interruptible(&uhid->report_wait); cancel_work_sync(&uhid->worker); hid_destroy_device(uhid->hid); uhid->hid = NULL; kfree(uhid->rd_data); return 0; } static int uhid_dev_input(struct uhid_device *uhid, struct uhid_event *ev) { if (!READ_ONCE(uhid->running)) return -EINVAL; hid_input_report(uhid->hid, HID_INPUT_REPORT, ev->u.input.data, min_t(size_t, ev->u.input.size, UHID_DATA_MAX), 0); return 0; } static int uhid_dev_input2(struct uhid_device *uhid, struct uhid_event *ev) { if (!READ_ONCE(uhid->running)) return -EINVAL; hid_input_report(uhid->hid, HID_INPUT_REPORT, ev->u.input2.data, min_t(size_t, ev->u.input2.size, UHID_DATA_MAX), 0); return 0; } static int uhid_dev_get_report_reply(struct uhid_device *uhid, struct uhid_event *ev) { if (!READ_ONCE(uhid->running)) return -EINVAL; uhid_report_wake_up(uhid, ev->u.get_report_reply.id, ev); return 0; } static int uhid_dev_set_report_reply(struct uhid_device *uhid, struct uhid_event *ev) { if (!READ_ONCE(uhid->running)) return -EINVAL; uhid_report_wake_up(uhid, ev->u.set_report_reply.id, ev); return 0; } static int uhid_char_open(struct inode *inode, struct file *file) { struct uhid_device *uhid; uhid = kzalloc(sizeof(*uhid), GFP_KERNEL); if (!uhid) return -ENOMEM; mutex_init(&uhid->devlock); mutex_init(&uhid->report_lock); spin_lock_init(&uhid->qlock); init_waitqueue_head(&uhid->waitq); init_waitqueue_head(&uhid->report_wait); uhid->running = false; INIT_WORK(&uhid->worker, uhid_device_add_worker); file->private_data = uhid; stream_open(inode, file); return 0; } static int uhid_char_release(struct inode *inode, struct file *file) { struct uhid_device *uhid = file->private_data; unsigned int i; uhid_dev_destroy(uhid); for (i = 0; i < UHID_BUFSIZE; ++i) kfree(uhid->outq[i]); kfree(uhid); return 0; } static ssize_t uhid_char_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct uhid_device *uhid = file->private_data; int ret; unsigned long flags; size_t len; /* they need at least the "type" member of uhid_event */ if (count < sizeof(__u32)) return -EINVAL; try_again: if (file->f_flags & O_NONBLOCK) { if (uhid->head == uhid->tail) return -EAGAIN; } else { ret = wait_event_interruptible(uhid->waitq, uhid->head != uhid->tail); if (ret) return ret; } ret = mutex_lock_interruptible(&uhid->devlock); if (ret) return ret; if (uhid->head == uhid->tail) { mutex_unlock(&uhid->devlock); goto try_again; } else { len = min(count, sizeof(**uhid->outq)); if (copy_to_user(buffer, uhid->outq[uhid->tail], len)) { ret = -EFAULT; } else { kfree(uhid->outq[uhid->tail]); uhid->outq[uhid->tail] = NULL; spin_lock_irqsave(&uhid->qlock, flags); uhid->tail = (uhid->tail + 1) % UHID_BUFSIZE; spin_unlock_irqrestore(&uhid->qlock, flags); } } mutex_unlock(&uhid->devlock); return ret ? ret : len; } static ssize_t uhid_char_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct uhid_device *uhid = file->private_data; int ret; size_t len; /* we need at least the "type" member of uhid_event */ if (count < sizeof(__u32)) return -EINVAL; ret = mutex_lock_interruptible(&uhid->devlock); if (ret) return ret; memset(&uhid->input_buf, 0, sizeof(uhid->input_buf)); len = min(count, sizeof(uhid->input_buf)); ret = uhid_event_from_user(buffer, len, &uhid->input_buf); if (ret) goto unlock; switch (uhid->input_buf.type) { case UHID_CREATE: /* * 'struct uhid_create_req' contains a __user pointer which is * copied from, so it's unsafe to allow this with elevated * privileges (e.g. from a setuid binary) or via kernel_write(). */ if (file->f_cred != current_cred()) { pr_err_once("UHID_CREATE from different security context by process %d (%s), this is not allowed.\n", task_tgid_vnr(current), current->comm); ret = -EACCES; goto unlock; } ret = uhid_dev_create(uhid, &uhid->input_buf); break; case UHID_CREATE2: ret = uhid_dev_create2(uhid, &uhid->input_buf); break; case UHID_DESTROY: ret = uhid_dev_destroy(uhid); break; case UHID_INPUT: ret = uhid_dev_input(uhid, &uhid->input_buf); break; case UHID_INPUT2: ret = uhid_dev_input2(uhid, &uhid->input_buf); break; case UHID_GET_REPORT_REPLY: ret = uhid_dev_get_report_reply(uhid, &uhid->input_buf); break; case UHID_SET_REPORT_REPLY: ret = uhid_dev_set_report_reply(uhid, &uhid->input_buf); break; default: ret = -EOPNOTSUPP; } unlock: mutex_unlock(&uhid->devlock); /* return "count" not "len" to not confuse the caller */ return ret ? ret : count; } static __poll_t uhid_char_poll(struct file *file, poll_table *wait) { struct uhid_device *uhid = file->private_data; __poll_t mask = EPOLLOUT | EPOLLWRNORM; /* uhid is always writable */ poll_wait(file, &uhid->waitq, wait); if (uhid->head != uhid->tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; } static const struct file_operations uhid_fops = { .owner = THIS_MODULE, .open = uhid_char_open, .release = uhid_char_release, .read = uhid_char_read, .write = uhid_char_write, .poll = uhid_char_poll, }; static struct miscdevice uhid_misc = { .fops = &uhid_fops, .minor = UHID_MINOR, .name = UHID_NAME, }; module_misc_device(uhid_misc); MODULE_LICENSE("GPL"); MODULE_AUTHOR("David Herrmann <dh.herrmann@gmail.com>"); MODULE_DESCRIPTION("User-space I/O driver support for HID subsystem"); MODULE_ALIAS_MISCDEV(UHID_MINOR); MODULE_ALIAS("devname:" UHID_NAME);
43 2 39 51 5 2 3 44 4 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 // SPDX-License-Identifier: GPL-2.0-only /* * VMware vSockets Driver * * Copyright (C) 2007-2012 VMware, Inc. All rights reserved. */ #include <linux/types.h> #include <linux/socket.h> #include <linux/stddef.h> #include <net/sock.h> #include <net/vsock_addr.h> void vsock_addr_init(struct sockaddr_vm *addr, u32 cid, u32 port) { memset(addr, 0, sizeof(*addr)); addr->svm_family = AF_VSOCK; addr->svm_cid = cid; addr->svm_port = port; } EXPORT_SYMBOL_GPL(vsock_addr_init); int vsock_addr_validate(const struct sockaddr_vm *addr) { __u8 svm_valid_flags = VMADDR_FLAG_TO_HOST; if (!addr) return -EFAULT; if (addr->svm_family != AF_VSOCK) return -EAFNOSUPPORT; if (addr->svm_flags & ~svm_valid_flags) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(vsock_addr_validate); bool vsock_addr_bound(const struct sockaddr_vm *addr) { return addr->svm_port != VMADDR_PORT_ANY; } EXPORT_SYMBOL_GPL(vsock_addr_bound); void vsock_addr_unbind(struct sockaddr_vm *addr) { vsock_addr_init(addr, VMADDR_CID_ANY, VMADDR_PORT_ANY); } EXPORT_SYMBOL_GPL(vsock_addr_unbind); bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, const struct sockaddr_vm *other) { return addr->svm_cid == other->svm_cid && addr->svm_port == other->svm_port; } EXPORT_SYMBOL_GPL(vsock_addr_equals_addr); int vsock_addr_cast(const struct sockaddr *addr, size_t len, struct sockaddr_vm **out_addr) { if (len < sizeof(**out_addr)) return -EFAULT; *out_addr = (struct sockaddr_vm *)addr; return vsock_addr_validate(*out_addr); } EXPORT_SYMBOL_GPL(vsock_addr_cast);
9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Header file for dma buffer sharing framework. * * Copyright(C) 2011 Linaro Limited. All rights reserved. * Author: Sumit Semwal <sumit.semwal@ti.com> * * Many thanks to linaro-mm-sig list, and specially * Arnd Bergmann <arnd@arndb.de>, Rob Clark <rob@ti.com> and * Daniel Vetter <daniel@ffwll.ch> for their support in creation and * refining of this idea. */ #ifndef __DMA_BUF_H__ #define __DMA_BUF_H__ #include <linux/iosys-map.h> #include <linux/file.h> #include <linux/err.h> #include <linux/scatterlist.h> #include <linux/list.h> #include <linux/dma-mapping.h> #include <linux/fs.h> #include <linux/dma-fence.h> #include <linux/wait.h> struct device; struct dma_buf; struct dma_buf_attachment; /** * struct dma_buf_ops - operations possible on struct dma_buf * @vmap: [optional] creates a virtual mapping for the buffer into kernel * address space. Same restrictions as for vmap and friends apply. * @vunmap: [optional] unmaps a vmap from the buffer */ struct dma_buf_ops { /** * @cache_sgt_mapping: * * If true the framework will cache the first mapping made for each * attachment. This avoids creating mappings for attachments multiple * times. */ bool cache_sgt_mapping; /** * @attach: * * This is called from dma_buf_attach() to make sure that a given * &dma_buf_attachment.dev can access the provided &dma_buf. Exporters * which support buffer objects in special locations like VRAM or * device-specific carveout areas should check whether the buffer could * be move to system memory (or directly accessed by the provided * device), and otherwise need to fail the attach operation. * * The exporter should also in general check whether the current * allocation fulfills the DMA constraints of the new device. If this * is not the case, and the allocation cannot be moved, it should also * fail the attach operation. * * Any exporter-private housekeeping data can be stored in the * &dma_buf_attachment.priv pointer. * * This callback is optional. * * Returns: * * 0 on success, negative error code on failure. It might return -EBUSY * to signal that backing storage is already allocated and incompatible * with the requirements of requesting device. */ int (*attach)(struct dma_buf *, struct dma_buf_attachment *); /** * @detach: * * This is called by dma_buf_detach() to release a &dma_buf_attachment. * Provided so that exporters can clean up any housekeeping for an * &dma_buf_attachment. * * This callback is optional. */ void (*detach)(struct dma_buf *, struct dma_buf_attachment *); /** * @pin: * * This is called by dma_buf_pin() and lets the exporter know that the * DMA-buf can't be moved any more. Ideally, the exporter should * pin the buffer so that it is generally accessible by all * devices. * * This is called with the &dmabuf.resv object locked and is mutual * exclusive with @cache_sgt_mapping. * * This is called automatically for non-dynamic importers from * dma_buf_attach(). * * Note that similar to non-dynamic exporters in their @map_dma_buf * callback the driver must guarantee that the memory is available for * use and cleared of any old data by the time this function returns. * Drivers which pipeline their buffer moves internally must wait for * all moves and clears to complete. * * Returns: * * 0 on success, negative error code on failure. */ int (*pin)(struct dma_buf_attachment *attach); /** * @unpin: * * This is called by dma_buf_unpin() and lets the exporter know that the * DMA-buf can be moved again. * * This is called with the dmabuf->resv object locked and is mutual * exclusive with @cache_sgt_mapping. * * This callback is optional. */ void (*unpin)(struct dma_buf_attachment *attach); /** * @map_dma_buf: * * This is called by dma_buf_map_attachment() and is used to map a * shared &dma_buf into device address space, and it is mandatory. It * can only be called if @attach has been called successfully. * * This call may sleep, e.g. when the backing storage first needs to be * allocated, or moved to a location suitable for all currently attached * devices. * * Note that any specific buffer attributes required for this function * should get added to device_dma_parameters accessible via * &device.dma_params from the &dma_buf_attachment. The @attach callback * should also check these constraints. * * If this is being called for the first time, the exporter can now * choose to scan through the list of attachments for this buffer, * collate the requirements of the attached devices, and choose an * appropriate backing storage for the buffer. * * Based on enum dma_data_direction, it might be possible to have * multiple users accessing at the same time (for reading, maybe), or * any other kind of sharing that the exporter might wish to make * available to buffer-users. * * This is always called with the dmabuf->resv object locked when * the dynamic_mapping flag is true. * * Note that for non-dynamic exporters the driver must guarantee that * that the memory is available for use and cleared of any old data by * the time this function returns. Drivers which pipeline their buffer * moves internally must wait for all moves and clears to complete. * Dynamic exporters do not need to follow this rule: For non-dynamic * importers the buffer is already pinned through @pin, which has the * same requirements. Dynamic importers otoh are required to obey the * dma_resv fences. * * Returns: * * A &sg_table scatter list of the backing storage of the DMA buffer, * already mapped into the device address space of the &device attached * with the provided &dma_buf_attachment. The addresses and lengths in * the scatter list are PAGE_SIZE aligned. * * On failure, returns a negative error value wrapped into a pointer. * May also return -EINTR when a signal was received while being * blocked. * * Note that exporters should not try to cache the scatter list, or * return the same one for multiple calls. Caching is done either by the * DMA-BUF code (for non-dynamic importers) or the importer. Ownership * of the scatter list is transferred to the caller, and returned by * @unmap_dma_buf. */ struct sg_table * (*map_dma_buf)(struct dma_buf_attachment *, enum dma_data_direction); /** * @unmap_dma_buf: * * This is called by dma_buf_unmap_attachment() and should unmap and * release the &sg_table allocated in @map_dma_buf, and it is mandatory. * For static dma_buf handling this might also unpin the backing * storage if this is the last mapping of the DMA buffer. */ void (*unmap_dma_buf)(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); /* TODO: Add try_map_dma_buf version, to return immed with -EBUSY * if the call would block. */ /** * @release: * * Called after the last dma_buf_put to release the &dma_buf, and * mandatory. */ void (*release)(struct dma_buf *); /** * @begin_cpu_access: * * This is called from dma_buf_begin_cpu_access() and allows the * exporter to ensure that the memory is actually coherent for cpu * access. The exporter also needs to ensure that cpu access is coherent * for the access direction. The direction can be used by the exporter * to optimize the cache flushing, i.e. access with a different * direction (read instead of write) might return stale or even bogus * data (e.g. when the exporter needs to copy the data to temporary * storage). * * Note that this is both called through the DMA_BUF_IOCTL_SYNC IOCTL * command for userspace mappings established through @mmap, and also * for kernel mappings established with @vmap. * * This callback is optional. * * Returns: * * 0 on success or a negative error code on failure. This can for * example fail when the backing storage can't be allocated. Can also * return -ERESTARTSYS or -EINTR when the call has been interrupted and * needs to be restarted. */ int (*begin_cpu_access)(struct dma_buf *, enum dma_data_direction); /** * @end_cpu_access: * * This is called from dma_buf_end_cpu_access() when the importer is * done accessing the CPU. The exporter can use this to flush caches and * undo anything else done in @begin_cpu_access. * * This callback is optional. * * Returns: * * 0 on success or a negative error code on failure. Can return * -ERESTARTSYS or -EINTR when the call has been interrupted and needs * to be restarted. */ int (*end_cpu_access)(struct dma_buf *, enum dma_data_direction); /** * @mmap: * * This callback is used by the dma_buf_mmap() function * * Note that the mapping needs to be incoherent, userspace is expected * to bracket CPU access using the DMA_BUF_IOCTL_SYNC interface. * * Because dma-buf buffers have invariant size over their lifetime, the * dma-buf core checks whether a vma is too large and rejects such * mappings. The exporter hence does not need to duplicate this check. * Drivers do not need to check this themselves. * * If an exporter needs to manually flush caches and hence needs to fake * coherency for mmap support, it needs to be able to zap all the ptes * pointing at the backing storage. Now linux mm needs a struct * address_space associated with the struct file stored in vma->vm_file * to do that with the function unmap_mapping_range. But the dma_buf * framework only backs every dma_buf fd with the anon_file struct file, * i.e. all dma_bufs share the same file. * * Hence exporters need to setup their own file (and address_space) * association by setting vma->vm_file and adjusting vma->vm_pgoff in * the dma_buf mmap callback. In the specific case of a gem driver the * exporter could use the shmem file already provided by gem (and set * vm_pgoff = 0). Exporters can then zap ptes by unmapping the * corresponding range of the struct address_space associated with their * own file. * * This callback is optional. * * Returns: * * 0 on success or a negative error code on failure. */ int (*mmap)(struct dma_buf *, struct vm_area_struct *vma); int (*vmap)(struct dma_buf *dmabuf, struct iosys_map *map); void (*vunmap)(struct dma_buf *dmabuf, struct iosys_map *map); }; /** * struct dma_buf - shared buffer object * * This represents a shared buffer, created by calling dma_buf_export(). The * userspace representation is a normal file descriptor, which can be created by * calling dma_buf_fd(). * * Shared dma buffers are reference counted using dma_buf_put() and * get_dma_buf(). * * Device DMA access is handled by the separate &struct dma_buf_attachment. */ struct dma_buf { /** * @size: * * Size of the buffer; invariant over the lifetime of the buffer. */ size_t size; /** * @file: * * File pointer used for sharing buffers across, and for refcounting. * See dma_buf_get() and dma_buf_put(). */ struct file *file; /** * @attachments: * * List of dma_buf_attachment that denotes all devices attached, * protected by &dma_resv lock @resv. */ struct list_head attachments; /** @ops: dma_buf_ops associated with this buffer object. */ const struct dma_buf_ops *ops; /** * @vmapping_counter: * * Used internally to refcnt the vmaps returned by dma_buf_vmap(). * Protected by @lock. */ unsigned vmapping_counter; /** * @vmap_ptr: * The current vmap ptr if @vmapping_counter > 0. Protected by @lock. */ struct iosys_map vmap_ptr; /** * @exp_name: * * Name of the exporter; useful for debugging. Must not be NULL */ const char *exp_name; /** * @name: * * Userspace-provided name. Default value is NULL. If not NULL, * length cannot be longer than DMA_BUF_NAME_LEN, including NIL * char. Useful for accounting and debugging. Read/Write accesses * are protected by @name_lock * * See the IOCTLs DMA_BUF_SET_NAME or DMA_BUF_SET_NAME_A/B */ const char *name; /** @name_lock: Spinlock to protect name access for read access. */ spinlock_t name_lock; /** * @owner: * * Pointer to exporter module; used for refcounting when exporter is a * kernel module. */ struct module *owner; #if IS_ENABLED(CONFIG_DEBUG_FS) /** @list_node: node for dma_buf accounting and debugging. */ struct list_head list_node; #endif /** @priv: exporter specific private data for this buffer object. */ void *priv; /** * @resv: * * Reservation object linked to this dma-buf. * * IMPLICIT SYNCHRONIZATION RULES: * * Drivers which support implicit synchronization of buffer access as * e.g. exposed in `Implicit Fence Poll Support`_ must follow the * below rules. * * - Drivers must add a read fence through dma_resv_add_fence() with the * DMA_RESV_USAGE_READ flag for anything the userspace API considers a * read access. This highly depends upon the API and window system. * * - Similarly drivers must add a write fence through * dma_resv_add_fence() with the DMA_RESV_USAGE_WRITE flag for * anything the userspace API considers write access. * * - Drivers may just always add a write fence, since that only * causes unnecessary synchronization, but no correctness issues. * * - Some drivers only expose a synchronous userspace API with no * pipelining across drivers. These do not set any fences for their * access. An example here is v4l. * * - Driver should use dma_resv_usage_rw() when retrieving fences as * dependency for implicit synchronization. * * DYNAMIC IMPORTER RULES: * * Dynamic importers, see dma_buf_attachment_is_dynamic(), have * additional constraints on how they set up fences: * * - Dynamic importers must obey the write fences and wait for them to * signal before allowing access to the buffer's underlying storage * through the device. * * - Dynamic importers should set fences for any access that they can't * disable immediately from their &dma_buf_attach_ops.move_notify * callback. * * IMPORTANT: * * All drivers and memory management related functions must obey the * struct dma_resv rules, specifically the rules for updating and * obeying fences. See enum dma_resv_usage for further descriptions. */ struct dma_resv *resv; /** @poll: for userspace poll support */ wait_queue_head_t poll; /** @cb_in: for userspace poll support */ /** @cb_out: for userspace poll support */ struct dma_buf_poll_cb_t { struct dma_fence_cb cb; wait_queue_head_t *poll; __poll_t active; } cb_in, cb_out; #ifdef CONFIG_DMABUF_SYSFS_STATS /** * @sysfs_entry: * * For exposing information about this buffer in sysfs. See also * `DMA-BUF statistics`_ for the uapi this enables. */ struct dma_buf_sysfs_entry { struct kobject kobj; struct dma_buf *dmabuf; } *sysfs_entry; #endif }; /** * struct dma_buf_attach_ops - importer operations for an attachment * * Attachment operations implemented by the importer. */ struct dma_buf_attach_ops { /** * @allow_peer2peer: * * If this is set to true the importer must be able to handle peer * resources without struct pages. */ bool allow_peer2peer; /** * @move_notify: [optional] notification that the DMA-buf is moving * * If this callback is provided the framework can avoid pinning the * backing store while mappings exists. * * This callback is called with the lock of the reservation object * associated with the dma_buf held and the mapping function must be * called with this lock held as well. This makes sure that no mapping * is created concurrently with an ongoing move operation. * * Mappings stay valid and are not directly affected by this callback. * But the DMA-buf can now be in a different physical location, so all * mappings should be destroyed and re-created as soon as possible. * * New mappings can be created after this callback returns, and will * point to the new location of the DMA-buf. */ void (*move_notify)(struct dma_buf_attachment *attach); }; /** * struct dma_buf_attachment - holds device-buffer attachment data * @dmabuf: buffer for this attachment. * @dev: device attached to the buffer. * @node: list of dma_buf_attachment, protected by dma_resv lock of the dmabuf. * @sgt: cached mapping. * @dir: direction of cached mapping. * @peer2peer: true if the importer can handle peer resources without pages. * @priv: exporter specific attachment data. * @importer_ops: importer operations for this attachment, if provided * dma_buf_map/unmap_attachment() must be called with the dma_resv lock held. * @importer_priv: importer specific attachment data. * * This structure holds the attachment information between the dma_buf buffer * and its user device(s). The list contains one attachment struct per device * attached to the buffer. * * An attachment is created by calling dma_buf_attach(), and released again by * calling dma_buf_detach(). The DMA mapping itself needed to initiate a * transfer is created by dma_buf_map_attachment() and freed again by calling * dma_buf_unmap_attachment(). */ struct dma_buf_attachment { struct dma_buf *dmabuf; struct device *dev; struct list_head node; struct sg_table *sgt; enum dma_data_direction dir; bool peer2peer; const struct dma_buf_attach_ops *importer_ops; void *importer_priv; void *priv; }; /** * struct dma_buf_export_info - holds information needed to export a dma_buf * @exp_name: name of the exporter - useful for debugging. * @owner: pointer to exporter module - used for refcounting kernel module * @ops: Attach allocator-defined dma buf ops to the new buffer * @size: Size of the buffer - invariant over the lifetime of the buffer * @flags: mode flags for the file * @resv: reservation-object, NULL to allocate default one * @priv: Attach private data of allocator to this buffer * * This structure holds the information required to export the buffer. Used * with dma_buf_export() only. */ struct dma_buf_export_info { const char *exp_name; struct module *owner; const struct dma_buf_ops *ops; size_t size; int flags; struct dma_resv *resv; void *priv; }; /** * DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters * @name: export-info name * * DEFINE_DMA_BUF_EXPORT_INFO macro defines the &struct dma_buf_export_info, * zeroes it out and pre-populates exp_name in it. */ #define DEFINE_DMA_BUF_EXPORT_INFO(name) \ struct dma_buf_export_info name = { .exp_name = KBUILD_MODNAME, \ .owner = THIS_MODULE } /** * get_dma_buf - convenience wrapper for get_file. * @dmabuf: [in] pointer to dma_buf * * Increments the reference count on the dma-buf, needed in case of drivers * that either need to create additional references to the dmabuf on the * kernel side. For example, an exporter that needs to keep a dmabuf ptr * so that subsequent exports don't create a new dmabuf. */ static inline void get_dma_buf(struct dma_buf *dmabuf) { get_file(dmabuf->file); } /** * dma_buf_is_dynamic - check if a DMA-buf uses dynamic mappings. * @dmabuf: the DMA-buf to check * * Returns true if a DMA-buf exporter wants to be called with the dma_resv * locked for the map/unmap callbacks, false if it doesn't wants to be called * with the lock held. */ static inline bool dma_buf_is_dynamic(struct dma_buf *dmabuf) { return !!dmabuf->ops->pin; } /** * dma_buf_attachment_is_dynamic - check if a DMA-buf attachment uses dynamic * mappings * @attach: the DMA-buf attachment to check * * Returns true if a DMA-buf importer wants to call the map/unmap functions with * the dma_resv lock held. */ static inline bool dma_buf_attachment_is_dynamic(struct dma_buf_attachment *attach) { return !!attach->importer_ops; } struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, struct device *dev); struct dma_buf_attachment * dma_buf_dynamic_attach(struct dma_buf *dmabuf, struct device *dev, const struct dma_buf_attach_ops *importer_ops, void *importer_priv); void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach); int dma_buf_pin(struct dma_buf_attachment *attach); void dma_buf_unpin(struct dma_buf_attachment *attach); struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info); int dma_buf_fd(struct dma_buf *dmabuf, int flags); struct dma_buf *dma_buf_get(int fd); void dma_buf_put(struct dma_buf *dmabuf); struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *, enum dma_data_direction); void dma_buf_unmap_attachment(struct dma_buf_attachment *, struct sg_table *, enum dma_data_direction); void dma_buf_move_notify(struct dma_buf *dma_buf); int dma_buf_begin_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); int dma_buf_end_cpu_access(struct dma_buf *dma_buf, enum dma_data_direction dir); struct sg_table * dma_buf_map_attachment_unlocked(struct dma_buf_attachment *attach, enum dma_data_direction direction); void dma_buf_unmap_attachment_unlocked(struct dma_buf_attachment *attach, struct sg_table *sg_table, enum dma_data_direction direction); int dma_buf_mmap(struct dma_buf *, struct vm_area_struct *, unsigned long); int dma_buf_vmap(struct dma_buf *dmabuf, struct iosys_map *map); void dma_buf_vunmap(struct dma_buf *dmabuf, struct iosys_map *map); int dma_buf_vmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map); void dma_buf_vunmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map); #endif /* __DMA_BUF_H__ */
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/sunrpc/auth.h * * Declarations for the RPC client authentication machinery. * * Copyright (C) 1996, Olaf Kirch <okir@monad.swb.de> */ #ifndef _LINUX_SUNRPC_AUTH_H #define _LINUX_SUNRPC_AUTH_H #include <linux/sunrpc/sched.h> #include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/xdr.h> #include <linux/atomic.h> #include <linux/rcupdate.h> #include <linux/uidgid.h> #include <linux/utsname.h> /* * Maximum size of AUTH_NONE authentication information, in XDR words. */ #define NUL_CALLSLACK (4) #define NUL_REPLYSLACK (2) /* * Size of the nodename buffer. RFC1831 specifies a hard limit of 255 bytes, * but Linux hostnames are actually limited to __NEW_UTS_LEN bytes. */ #define UNX_MAXNODENAME __NEW_UTS_LEN #define UNX_CALLSLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME)) #define UNX_NGROUPS 16 struct rpcsec_gss_info; struct auth_cred { const struct cred *cred; const char *principal; /* If present, this is a machine credential */ }; /* * Client user credentials */ struct rpc_auth; struct rpc_credops; struct rpc_cred { struct hlist_node cr_hash; /* hash chain */ struct list_head cr_lru; /* lru garbage collection */ struct rcu_head cr_rcu; struct rpc_auth * cr_auth; const struct rpc_credops *cr_ops; unsigned long cr_expire; /* when to gc */ unsigned long cr_flags; /* various flags */ refcount_t cr_count; /* ref count */ const struct cred *cr_cred; /* per-flavor data */ }; #define RPCAUTH_CRED_NEW 0 #define RPCAUTH_CRED_UPTODATE 1 #define RPCAUTH_CRED_HASHED 2 #define RPCAUTH_CRED_NEGATIVE 3 const struct cred *rpc_machine_cred(void); /* * Client authentication handle */ struct rpc_cred_cache; struct rpc_authops; struct rpc_auth { unsigned int au_cslack; /* call cred size estimate */ unsigned int au_rslack; /* reply cred size estimate */ unsigned int au_verfsize; /* size of reply verifier */ unsigned int au_ralign; /* words before UL header */ unsigned long au_flags; const struct rpc_authops *au_ops; rpc_authflavor_t au_flavor; /* pseudoflavor (note may * differ from the flavor in * au_ops->au_flavor in gss * case) */ refcount_t au_count; /* Reference counter */ struct rpc_cred_cache * au_credcache; /* per-flavor data */ }; /* rpc_auth au_flags */ #define RPCAUTH_AUTH_DATATOUCH (1) #define RPCAUTH_AUTH_UPDATE_SLACK (2) struct rpc_auth_create_args { rpc_authflavor_t pseudoflavor; const char *target_name; }; /* Flags for rpcauth_lookupcred() */ #define RPCAUTH_LOOKUP_NEW 0x01 /* Accept an uninitialised cred */ #define RPCAUTH_LOOKUP_ASYNC 0x02 /* Don't block waiting for memory */ /* * Client authentication ops */ struct rpc_authops { struct module *owner; rpc_authflavor_t au_flavor; /* flavor (RPC_AUTH_*) */ char * au_name; struct rpc_auth * (*create)(const struct rpc_auth_create_args *, struct rpc_clnt *); void (*destroy)(struct rpc_auth *); int (*hash_cred)(struct auth_cred *, unsigned int); struct rpc_cred * (*lookup_cred)(struct rpc_auth *, struct auth_cred *, int); struct rpc_cred * (*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t); rpc_authflavor_t (*info2flavor)(struct rpcsec_gss_info *); int (*flavor2info)(rpc_authflavor_t, struct rpcsec_gss_info *); int (*key_timeout)(struct rpc_auth *, struct rpc_cred *); int (*ping)(struct rpc_clnt *clnt); }; struct rpc_credops { const char * cr_name; /* Name of the auth flavour */ int (*cr_init)(struct rpc_auth *, struct rpc_cred *); void (*crdestroy)(struct rpc_cred *); int (*crmatch)(struct auth_cred *, struct rpc_cred *, int); int (*crmarshal)(struct rpc_task *task, struct xdr_stream *xdr); int (*crrefresh)(struct rpc_task *); int (*crvalidate)(struct rpc_task *task, struct xdr_stream *xdr); int (*crwrap_req)(struct rpc_task *task, struct xdr_stream *xdr); int (*crunwrap_resp)(struct rpc_task *task, struct xdr_stream *xdr); int (*crkey_timeout)(struct rpc_cred *); char * (*crstringify_acceptor)(struct rpc_cred *); bool (*crneed_reencode)(struct rpc_task *); }; extern const struct rpc_authops authunix_ops; extern const struct rpc_authops authnull_ops; extern const struct rpc_authops authtls_ops; int __init rpc_init_authunix(void); int __init rpcauth_init_module(void); void rpcauth_remove_module(void); void rpc_destroy_authunix(void); int rpcauth_register(const struct rpc_authops *); int rpcauth_unregister(const struct rpc_authops *); struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *, struct rpc_clnt *); void rpcauth_release(struct rpc_auth *); rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t, struct rpcsec_gss_info *); int rpcauth_get_gssinfo(rpc_authflavor_t, struct rpcsec_gss_info *); struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int, gfp_t); void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); struct rpc_cred * rpcauth_lookupcred(struct rpc_auth *, int); void put_rpccred(struct rpc_cred *); int rpcauth_marshcred(struct rpc_task *task, struct xdr_stream *xdr); int rpcauth_checkverf(struct rpc_task *task, struct xdr_stream *xdr); int rpcauth_wrap_req_encode(struct rpc_task *task, struct xdr_stream *xdr); int rpcauth_wrap_req(struct rpc_task *task, struct xdr_stream *xdr); int rpcauth_unwrap_resp_decode(struct rpc_task *task, struct xdr_stream *xdr); int rpcauth_unwrap_resp(struct rpc_task *task, struct xdr_stream *xdr); bool rpcauth_xmit_need_reencode(struct rpc_task *task); int rpcauth_refreshcred(struct rpc_task *); void rpcauth_invalcred(struct rpc_task *); int rpcauth_uptodatecred(struct rpc_task *); int rpcauth_init_credcache(struct rpc_auth *); void rpcauth_destroy_credcache(struct rpc_auth *); void rpcauth_clear_credcache(struct rpc_cred_cache *); char * rpcauth_stringify_acceptor(struct rpc_cred *); static inline struct rpc_cred *get_rpccred(struct rpc_cred *cred) { if (cred != NULL && refcount_inc_not_zero(&cred->cr_count)) return cred; return NULL; } #endif /* _LINUX_SUNRPC_AUTH_H */
148 79 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Hash algorithms. * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_HASH_H #define _CRYPTO_INTERNAL_HASH_H #include <crypto/algapi.h> #include <crypto/hash.h> struct ahash_request; struct scatterlist; struct crypto_hash_walk { char *data; unsigned int offset; unsigned int flags; struct page *pg; unsigned int entrylen; unsigned int total; struct scatterlist *sg; }; struct ahash_instance { void (*free)(struct ahash_instance *inst); union { struct { char head[offsetof(struct ahash_alg, halg.base)]; struct crypto_instance base; } s; struct ahash_alg alg; }; }; struct shash_instance { void (*free)(struct shash_instance *inst); union { struct { char head[offsetof(struct shash_alg, base)]; struct crypto_instance base; } s; struct shash_alg alg; }; }; struct crypto_ahash_spawn { struct crypto_spawn base; }; struct crypto_shash_spawn { struct crypto_spawn base; }; int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err); int crypto_hash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk); static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk) { return !(walk->entrylen | walk->total); } int crypto_register_ahash(struct ahash_alg *alg); void crypto_unregister_ahash(struct ahash_alg *alg); int crypto_register_ahashes(struct ahash_alg *algs, int count); void crypto_unregister_ahashes(struct ahash_alg *algs, int count); int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst); int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen); static inline bool crypto_shash_alg_has_setkey(struct shash_alg *alg) { return alg->setkey != shash_no_setkey; } static inline bool crypto_shash_alg_needs_key(struct shash_alg *alg) { return crypto_shash_alg_has_setkey(alg) && !(alg->base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY); } int crypto_grab_ahash(struct crypto_ahash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_ahash(struct crypto_ahash_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct hash_alg_common *crypto_spawn_ahash_alg( struct crypto_ahash_spawn *spawn) { return __crypto_hash_alg_common(spawn->base.alg); } int crypto_register_shash(struct shash_alg *alg); void crypto_unregister_shash(struct shash_alg *alg); int crypto_register_shashes(struct shash_alg *algs, int count); void crypto_unregister_shashes(struct shash_alg *algs, int count); int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst); void shash_free_singlespawn_instance(struct shash_instance *inst); int crypto_grab_shash(struct crypto_shash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_shash(struct crypto_shash_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct shash_alg *crypto_spawn_shash_alg( struct crypto_shash_spawn *spawn) { return __crypto_shash_alg(spawn->base.alg); } int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc); int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc); int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc); static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) { return crypto_tfm_ctx(crypto_ahash_tfm(tfm)); } static inline void *crypto_ahash_ctx_dma(struct crypto_ahash *tfm) { return crypto_tfm_ctx_dma(crypto_ahash_tfm(tfm)); } static inline struct ahash_alg *__crypto_ahash_alg(struct crypto_alg *alg) { return container_of(__crypto_hash_alg_common(alg), struct ahash_alg, halg); } static inline struct ahash_alg *crypto_ahash_alg(struct crypto_ahash *hash) { return container_of(crypto_hash_alg_common(hash), struct ahash_alg, halg); } static inline void crypto_ahash_set_statesize(struct crypto_ahash *tfm, unsigned int size) { tfm->statesize = size; } static inline void crypto_ahash_set_reqsize(struct crypto_ahash *tfm, unsigned int reqsize) { tfm->reqsize = reqsize; } static inline void crypto_ahash_set_reqsize_dma(struct crypto_ahash *ahash, unsigned int reqsize) { reqsize += crypto_dma_align() & ~(crypto_tfm_ctx_alignment() - 1); ahash->reqsize = reqsize; } static inline struct crypto_instance *ahash_crypto_instance( struct ahash_instance *inst) { return &inst->s.base; } static inline struct ahash_instance *ahash_instance( struct crypto_instance *inst) { return container_of(inst, struct ahash_instance, s.base); } static inline struct ahash_instance *ahash_alg_instance( struct crypto_ahash *ahash) { return ahash_instance(crypto_tfm_alg_instance(&ahash->base)); } static inline void *ahash_instance_ctx(struct ahash_instance *inst) { return crypto_instance_ctx(ahash_crypto_instance(inst)); } static inline void *ahash_request_ctx_dma(struct ahash_request *req) { unsigned int align = crypto_dma_align(); if (align <= crypto_tfm_ctx_alignment()) align = 1; return PTR_ALIGN(ahash_request_ctx(req), align); } static inline void ahash_request_complete(struct ahash_request *req, int err) { crypto_request_complete(&req->base, err); } static inline u32 ahash_request_flags(struct ahash_request *req) { return req->base.flags; } static inline struct crypto_ahash *crypto_spawn_ahash( struct crypto_ahash_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline int ahash_enqueue_request(struct crypto_queue *queue, struct ahash_request *request) { return crypto_enqueue_request(queue, &request->base); } static inline struct ahash_request *ahash_dequeue_request( struct crypto_queue *queue) { return ahash_request_cast(crypto_dequeue_request(queue)); } static inline void *crypto_shash_ctx(struct crypto_shash *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline struct crypto_instance *shash_crypto_instance( struct shash_instance *inst) { return &inst->s.base; } static inline struct shash_instance *shash_instance( struct crypto_instance *inst) { return container_of(inst, struct shash_instance, s.base); } static inline struct shash_instance *shash_alg_instance( struct crypto_shash *shash) { return shash_instance(crypto_tfm_alg_instance(&shash->base)); } static inline void *shash_instance_ctx(struct shash_instance *inst) { return crypto_instance_ctx(shash_crypto_instance(inst)); } static inline struct crypto_shash *crypto_spawn_shash( struct crypto_shash_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_shash, base); } #endif /* _CRYPTO_INTERNAL_HASH_H */
1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 2 3 3 2 2 3 2 2 1 3 2 1 3 3 3 3 3 1 3 1 1 3 3 3 3 3 3 3 3 1 3 33 33 33 33 33 33 6 33 33 33 32 414 414 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/condition.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* List of "struct tomoyo_condition". */ LIST_HEAD(tomoyo_condition_list); /** * tomoyo_argv - Check argv[] in "struct linux_binbrm". * * @index: Index number of @arg_ptr. * @arg_ptr: Contents of argv[@index]. * @argc: Length of @argv. * @argv: Pointer to "struct tomoyo_argv". * @checked: Set to true if @argv[@index] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_argv(const unsigned int index, const char *arg_ptr, const int argc, const struct tomoyo_argv *argv, u8 *checked) { int i; struct tomoyo_path_info arg; arg.name = arg_ptr; for (i = 0; i < argc; argv++, checked++, i++) { bool result; if (index != argv->index) continue; *checked = 1; tomoyo_fill_path_info(&arg); result = tomoyo_path_matches_pattern(&arg, argv->value); if (argv->is_not) result = !result; if (!result) return false; } return true; } /** * tomoyo_envp - Check envp[] in "struct linux_binbrm". * * @env_name: The name of environment variable. * @env_value: The value of environment variable. * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * @checked: Set to true if @envp[@env_name] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_envp(const char *env_name, const char *env_value, const int envc, const struct tomoyo_envp *envp, u8 *checked) { int i; struct tomoyo_path_info name; struct tomoyo_path_info value; name.name = env_name; tomoyo_fill_path_info(&name); value.name = env_value; tomoyo_fill_path_info(&value); for (i = 0; i < envc; envp++, checked++, i++) { bool result; if (!tomoyo_path_matches_pattern(&name, envp->name)) continue; *checked = 1; if (envp->value) { result = tomoyo_path_matches_pattern(&value, envp->value); if (envp->is_not) result = !result; } else { result = true; if (!envp->is_not) result = !result; } if (!result) return false; } return true; } /** * tomoyo_scan_bprm - Scan "struct linux_binprm". * * @ee: Pointer to "struct tomoyo_execve". * @argc: Length of @argc. * @argv: Pointer to "struct tomoyo_argv". * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee, const u16 argc, const struct tomoyo_argv *argv, const u16 envc, const struct tomoyo_envp *envp) { struct linux_binprm *bprm = ee->bprm; struct tomoyo_page_dump *dump = &ee->dump; char *arg_ptr = ee->tmp; int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool result = true; u8 local_checked[32]; u8 *checked; if (argc + envc <= sizeof(local_checked)) { checked = local_checked; memset(local_checked, 0, sizeof(local_checked)); } else { checked = kzalloc(argc + envc, GFP_NOFS); if (!checked) return false; } while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) { result = false; goto out; } pos += PAGE_SIZE - offset; while (offset < PAGE_SIZE) { /* Read. */ const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; /* Check. */ if (argv_count) { if (!tomoyo_argv(bprm->argc - argv_count, arg_ptr, argc, argv, checked)) { result = false; break; } argv_count--; } else if (envp_count) { char *cp = strchr(arg_ptr, '='); if (cp) { *cp = '\0'; if (!tomoyo_envp(arg_ptr, cp + 1, envc, envp, checked + argc)) { result = false; break; } } envp_count--; } else { break; } arg_len = 0; } offset = 0; if (!result) break; } out: if (result) { int i; /* Check not-yet-checked entries. */ for (i = 0; i < argc; i++) { if (checked[i]) continue; /* * Return true only if all unchecked indexes in * bprm->argv[] are not matched. */ if (argv[i].is_not) continue; result = false; break; } for (i = 0; i < envc; envp++, i++) { if (checked[argc + i]) continue; /* * Return true only if all unchecked environ variables * in bprm->envp[] are either undefined or not matched. */ if ((!envp->value && !envp->is_not) || (envp->value && envp->is_not)) continue; result = false; break; } } if (checked != local_checked) kfree(checked); return result; } /** * tomoyo_scan_exec_realpath - Check "exec.realpath" parameter of "struct tomoyo_condition". * * @file: Pointer to "struct file". * @ptr: Pointer to "struct tomoyo_name_union". * @match: True if "exec.realpath=", false if "exec.realpath!=". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_exec_realpath(struct file *file, const struct tomoyo_name_union *ptr, const bool match) { bool result; struct tomoyo_path_info exe; if (!file) return false; exe.name = tomoyo_realpath_from_path(&file->f_path); if (!exe.name) return false; tomoyo_fill_path_info(&exe); result = tomoyo_compare_name_union(&exe, ptr); kfree(exe.name); return result == match; } /** * tomoyo_get_dqword - tomoyo_get_name() for a quoted string. * * @start: String to save. * * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise. */ static const struct tomoyo_path_info *tomoyo_get_dqword(char *start) { char *cp = start + strlen(start) - 1; if (cp == start || *start++ != '"' || *cp != '"') return NULL; *cp = '\0'; if (*start && !tomoyo_correct_word(start)) return NULL; return tomoyo_get_name(start); } /** * tomoyo_parse_name_union_quoted - Parse a quoted word. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr) { char *filename = param->data; if (*filename == '@') return tomoyo_parse_name_union(param, ptr); ptr->filename = tomoyo_get_dqword(filename); return ptr->filename != NULL; } /** * tomoyo_parse_argv - Parse an argv[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @argv: Pointer to "struct tomoyo_argv". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_argv(char *left, char *right, struct tomoyo_argv *argv) { if (tomoyo_parse_ulong(&argv->index, &left) != TOMOYO_VALUE_TYPE_DECIMAL || *left++ != ']' || *left) return false; argv->value = tomoyo_get_dqword(right); return argv->value != NULL; } /** * tomoyo_parse_envp - Parse an envp[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_envp(char *left, char *right, struct tomoyo_envp *envp) { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; char *cp = left + strlen(left) - 1; if (*cp-- != ']' || *cp != '"') goto out; *cp = '\0'; if (!tomoyo_correct_word(left)) goto out; name = tomoyo_get_name(left); if (!name) goto out; if (!strcmp(right, "NULL")) { value = NULL; } else { value = tomoyo_get_dqword(right); if (!value) { tomoyo_put_name(name); goto out; } } envp->name = name; envp->value = value; return true; out: return false; } /** * tomoyo_same_condition - Check for duplicated "struct tomoyo_condition" entry. * * @a: Pointer to "struct tomoyo_condition". * @b: Pointer to "struct tomoyo_condition". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_condition(const struct tomoyo_condition *a, const struct tomoyo_condition *b) { return a->size == b->size && a->condc == b->condc && a->numbers_count == b->numbers_count && a->names_count == b->names_count && a->argc == b->argc && a->envc == b->envc && a->grant_log == b->grant_log && a->transit == b->transit && !memcmp(a + 1, b + 1, a->size - sizeof(*a)); } /** * tomoyo_condition_type - Get condition type. * * @word: Keyword string. * * Returns one of values in "enum tomoyo_conditions_index" on success, * TOMOYO_MAX_CONDITION_KEYWORD otherwise. */ static u8 tomoyo_condition_type(const char *word) { u8 i; for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) { if (!strcmp(word, tomoyo_condition_keyword[i])) break; } return i; } /* Define this to enable debug mode. */ /* #define DEBUG_CONDITION */ #ifdef DEBUG_CONDITION #define dprintk printk #else #define dprintk(...) do { } while (0) #endif /** * tomoyo_commit_condition - Commit "struct tomoyo_condition". * * @entry: Pointer to "struct tomoyo_condition". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. * * This function merges duplicated entries. This function returns NULL if * @entry is not duplicated but memory quota for policy has exceeded. */ static struct tomoyo_condition *tomoyo_commit_condition (struct tomoyo_condition *entry) { struct tomoyo_condition *ptr; bool found = false; if (mutex_lock_interruptible(&tomoyo_policy_lock)) { dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); ptr = NULL; found = true; goto out; } list_for_each_entry(ptr, &tomoyo_condition_list, head.list) { if (!tomoyo_same_condition(ptr, entry) || atomic_read(&ptr->head.users) == TOMOYO_GC_IN_PROGRESS) continue; /* Same entry found. Share this entry. */ atomic_inc(&ptr->head.users); found = true; break; } if (!found) { if (tomoyo_memory_ok(entry)) { atomic_set(&entry->head.users, 1); list_add(&entry->head.list, &tomoyo_condition_list); } else { found = true; ptr = NULL; } } mutex_unlock(&tomoyo_policy_lock); out: if (found) { tomoyo_del_condition(&entry->head.list); kfree(entry); entry = ptr; } return entry; } /** * tomoyo_get_transit_preference - Parse domain transition preference for execve(). * * @param: Pointer to "struct tomoyo_acl_param". * @e: Pointer to "struct tomoyo_condition". * * Returns the condition string part. */ static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param, struct tomoyo_condition *e) { char * const pos = param->data; bool flag; if (*pos == '<') { e->transit = tomoyo_get_domainname(param); goto done; } { char *cp = strchr(pos, ' '); if (cp) *cp = '\0'; flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") || !strcmp(pos, "initialize") || !strcmp(pos, "reset") || !strcmp(pos, "child") || !strcmp(pos, "parent"); if (cp) *cp = ' '; } if (!flag) return pos; e->transit = tomoyo_get_name(tomoyo_read_token(param)); done: if (e->transit) return param->data; /* * Return a bad read-only condition string that will let * tomoyo_get_condition() return NULL. */ return "/"; } /** * tomoyo_get_condition - Parse condition part. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. */ struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param) { struct tomoyo_condition *entry = NULL; struct tomoyo_condition_element *condp = NULL; struct tomoyo_number_union *numbers_p = NULL; struct tomoyo_name_union *names_p = NULL; struct tomoyo_argv *argv = NULL; struct tomoyo_envp *envp = NULL; struct tomoyo_condition e = { }; char * const start_of_string = tomoyo_get_transit_preference(param, &e); char * const end_of_string = start_of_string + strlen(start_of_string); char *pos; rerun: pos = start_of_string; while (1) { u8 left = -1; u8 right = -1; char *left_word = pos; char *cp; char *right_word; bool is_not; if (!*left_word) break; /* * Since left-hand condition does not allow use of "path_group" * or "number_group" and environment variable's names do not * accept '=', it is guaranteed that the original line consists * of one or more repetition of $left$operator$right blocks * where "$left is free from '=' and ' '" and "$operator is * either '=' or '!='" and "$right is free from ' '". * Therefore, we can reconstruct the original line at the end * of dry run even if we overwrite $operator with '\0'. */ cp = strchr(pos, ' '); if (cp) { *cp = '\0'; /* Will restore later. */ pos = cp + 1; } else { pos = ""; } right_word = strchr(left_word, '='); if (!right_word || right_word == left_word) goto out; is_not = *(right_word - 1) == '!'; if (is_not) *(right_word++ - 1) = '\0'; /* Will restore later. */ else if (*(right_word + 1) != '=') *right_word++ = '\0'; /* Will restore later. */ else goto out; dprintk(KERN_WARNING "%u: <%s>%s=<%s>\n", __LINE__, left_word, is_not ? "!" : "", right_word); if (!strcmp(left_word, "grant_log")) { if (entry) { if (is_not || entry->grant_log != TOMOYO_GRANTLOG_AUTO) goto out; else if (!strcmp(right_word, "yes")) entry->grant_log = TOMOYO_GRANTLOG_YES; else if (!strcmp(right_word, "no")) entry->grant_log = TOMOYO_GRANTLOG_NO; else goto out; } continue; } if (!strncmp(left_word, "exec.argv[", 10)) { if (!argv) { e.argc++; e.condc++; } else { e.argc--; e.condc--; left = TOMOYO_ARGV_ENTRY; argv->is_not = is_not; if (!tomoyo_parse_argv(left_word + 10, right_word, argv++)) goto out; } goto store_value; } if (!strncmp(left_word, "exec.envp[\"", 11)) { if (!envp) { e.envc++; e.condc++; } else { e.envc--; e.condc--; left = TOMOYO_ENVP_ENTRY; envp->is_not = is_not; if (!tomoyo_parse_envp(left_word + 11, right_word, envp++)) goto out; } goto store_value; } left = tomoyo_condition_type(left_word); dprintk(KERN_WARNING "%u: <%s> left=%u\n", __LINE__, left_word, left); if (left == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; left = TOMOYO_NUMBER_UNION; param->data = left_word; if (*left_word == '@' || !tomoyo_parse_number_union(param, numbers_p++)) goto out; } } if (!condp) e.condc++; else e.condc--; if (left == TOMOYO_EXEC_REALPATH || left == TOMOYO_SYMLINK_TARGET) { if (!names_p) { e.names_count++; } else { e.names_count--; right = TOMOYO_NAME_UNION; param->data = right_word; if (!tomoyo_parse_name_union_quoted(param, names_p++)) goto out; } goto store_value; } right = tomoyo_condition_type(right_word); if (right == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; right = TOMOYO_NUMBER_UNION; param->data = right_word; if (!tomoyo_parse_number_union(param, numbers_p++)) goto out; } } store_value: if (!condp) { dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n", __LINE__, left, right, !is_not); continue; } condp->left = left; condp->right = right; condp->equals = !is_not; dprintk(KERN_WARNING "%u: left=%u right=%u match=%u\n", __LINE__, condp->left, condp->right, condp->equals); condp++; } dprintk(KERN_INFO "%u: cond=%u numbers=%u names=%u ac=%u ec=%u\n", __LINE__, e.condc, e.numbers_count, e.names_count, e.argc, e.envc); if (entry) { BUG_ON(e.names_count | e.numbers_count | e.argc | e.envc | e.condc); return tomoyo_commit_condition(entry); } e.size = sizeof(*entry) + e.condc * sizeof(struct tomoyo_condition_element) + e.numbers_count * sizeof(struct tomoyo_number_union) + e.names_count * sizeof(struct tomoyo_name_union) + e.argc * sizeof(struct tomoyo_argv) + e.envc * sizeof(struct tomoyo_envp); entry = kzalloc(e.size, GFP_NOFS); if (!entry) goto out2; *entry = e; e.transit = NULL; condp = (struct tomoyo_condition_element *) (entry + 1); numbers_p = (struct tomoyo_number_union *) (condp + e.condc); names_p = (struct tomoyo_name_union *) (numbers_p + e.numbers_count); argv = (struct tomoyo_argv *) (names_p + e.names_count); envp = (struct tomoyo_envp *) (argv + e.argc); { bool flag = false; for (pos = start_of_string; pos < end_of_string; pos++) { if (*pos) continue; if (flag) /* Restore " ". */ *pos = ' '; else if (*(pos + 1) == '=') /* Restore "!=". */ *pos = '!'; else /* Restore "=". */ *pos = '='; flag = !flag; } } goto rerun; out: dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); if (entry) { tomoyo_del_condition(&entry->head.list); kfree(entry); } out2: tomoyo_put_name(e.transit); return NULL; } /** * tomoyo_get_attributes - Revalidate "struct inode". * * @obj: Pointer to "struct tomoyo_obj_info". * * Returns nothing. */ void tomoyo_get_attributes(struct tomoyo_obj_info *obj) { u8 i; struct dentry *dentry = NULL; for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct inode *inode; switch (i) { case TOMOYO_PATH1: dentry = obj->path1.dentry; if (!dentry) continue; break; case TOMOYO_PATH2: dentry = obj->path2.dentry; if (!dentry) continue; break; default: if (!dentry) continue; dentry = dget_parent(dentry); break; } inode = d_backing_inode(dentry); if (inode) { struct tomoyo_mini_stat *stat = &obj->stat[i]; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->dev = inode->i_sb->s_dev; stat->rdev = inode->i_rdev; obj->stat_valid[i] = true; } if (i & 1) /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */ dput(dentry); } } /** * tomoyo_condition - Check condition part. * * @r: Pointer to "struct tomoyo_request_info". * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond) { u32 i; unsigned long min_v[2] = { 0, 0 }; unsigned long max_v[2] = { 0, 0 }; const struct tomoyo_condition_element *condp; const struct tomoyo_number_union *numbers_p; const struct tomoyo_name_union *names_p; const struct tomoyo_argv *argv; const struct tomoyo_envp *envp; struct tomoyo_obj_info *obj; u16 condc; u16 argc; u16 envc; struct linux_binprm *bprm = NULL; if (!cond) return true; condc = cond->condc; argc = cond->argc; envc = cond->envc; obj = r->obj; if (r->ee) bprm = r->ee->bprm; if (!bprm && (argc || envc)) return false; condp = (struct tomoyo_condition_element *) (cond + 1); numbers_p = (const struct tomoyo_number_union *) (condp + condc); names_p = (const struct tomoyo_name_union *) (numbers_p + cond->numbers_count); argv = (const struct tomoyo_argv *) (names_p + cond->names_count); envp = (const struct tomoyo_envp *) (argv + argc); for (i = 0; i < condc; i++) { const bool match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; bool is_bitop[2] = { false, false }; u8 j; condp++; /* Check argv[] and envp[] later. */ if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY) continue; /* Check string expressions. */ if (right == TOMOYO_NAME_UNION) { const struct tomoyo_name_union *ptr = names_p++; struct tomoyo_path_info *symlink; struct tomoyo_execve *ee; struct file *file; switch (left) { case TOMOYO_SYMLINK_TARGET: symlink = obj ? obj->symlink_target : NULL; if (!symlink || !tomoyo_compare_name_union(symlink, ptr) == match) goto out; break; case TOMOYO_EXEC_REALPATH: ee = r->ee; file = ee ? ee->bprm->file : NULL; if (!tomoyo_scan_exec_realpath(file, ptr, match)) goto out; break; } continue; } /* Check numeric or bit-op expressions. */ for (j = 0; j < 2; j++) { const u8 index = j ? right : left; unsigned long value = 0; switch (index) { case TOMOYO_TASK_UID: value = from_kuid(&init_user_ns, current_uid()); break; case TOMOYO_TASK_EUID: value = from_kuid(&init_user_ns, current_euid()); break; case TOMOYO_TASK_SUID: value = from_kuid(&init_user_ns, current_suid()); break; case TOMOYO_TASK_FSUID: value = from_kuid(&init_user_ns, current_fsuid()); break; case TOMOYO_TASK_GID: value = from_kgid(&init_user_ns, current_gid()); break; case TOMOYO_TASK_EGID: value = from_kgid(&init_user_ns, current_egid()); break; case TOMOYO_TASK_SGID: value = from_kgid(&init_user_ns, current_sgid()); break; case TOMOYO_TASK_FSGID: value = from_kgid(&init_user_ns, current_fsgid()); break; case TOMOYO_TASK_PID: value = tomoyo_sys_getpid(); break; case TOMOYO_TASK_PPID: value = tomoyo_sys_getppid(); break; case TOMOYO_TYPE_IS_SOCKET: value = S_IFSOCK; break; case TOMOYO_TYPE_IS_SYMLINK: value = S_IFLNK; break; case TOMOYO_TYPE_IS_FILE: value = S_IFREG; break; case TOMOYO_TYPE_IS_BLOCK_DEV: value = S_IFBLK; break; case TOMOYO_TYPE_IS_DIRECTORY: value = S_IFDIR; break; case TOMOYO_TYPE_IS_CHAR_DEV: value = S_IFCHR; break; case TOMOYO_TYPE_IS_FIFO: value = S_IFIFO; break; case TOMOYO_MODE_SETUID: value = S_ISUID; break; case TOMOYO_MODE_SETGID: value = S_ISGID; break; case TOMOYO_MODE_STICKY: value = S_ISVTX; break; case TOMOYO_MODE_OWNER_READ: value = 0400; break; case TOMOYO_MODE_OWNER_WRITE: value = 0200; break; case TOMOYO_MODE_OWNER_EXECUTE: value = 0100; break; case TOMOYO_MODE_GROUP_READ: value = 0040; break; case TOMOYO_MODE_GROUP_WRITE: value = 0020; break; case TOMOYO_MODE_GROUP_EXECUTE: value = 0010; break; case TOMOYO_MODE_OTHERS_READ: value = 0004; break; case TOMOYO_MODE_OTHERS_WRITE: value = 0002; break; case TOMOYO_MODE_OTHERS_EXECUTE: value = 0001; break; case TOMOYO_EXEC_ARGC: if (!bprm) goto out; value = bprm->argc; break; case TOMOYO_EXEC_ENVC: if (!bprm) goto out; value = bprm->envc; break; case TOMOYO_NUMBER_UNION: /* Fetch values later. */ break; default: if (!obj) goto out; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } { u8 stat_index; struct tomoyo_mini_stat *stat; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH1_GID: case TOMOYO_PATH1_INO: case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH1_MINOR: case TOMOYO_PATH1_TYPE: case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH1_PERM: stat_index = TOMOYO_PATH1; break; case TOMOYO_PATH2_UID: case TOMOYO_PATH2_GID: case TOMOYO_PATH2_INO: case TOMOYO_PATH2_MAJOR: case TOMOYO_PATH2_MINOR: case TOMOYO_PATH2_TYPE: case TOMOYO_PATH2_DEV_MAJOR: case TOMOYO_PATH2_DEV_MINOR: case TOMOYO_PATH2_PERM: stat_index = TOMOYO_PATH2; break; case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH1_PARENT_PERM: stat_index = TOMOYO_PATH1_PARENT; break; case TOMOYO_PATH2_PARENT_UID: case TOMOYO_PATH2_PARENT_GID: case TOMOYO_PATH2_PARENT_INO: case TOMOYO_PATH2_PARENT_PERM: stat_index = TOMOYO_PATH2_PARENT; break; default: goto out; } if (!obj->stat_valid[stat_index]) goto out; stat = &obj->stat[stat_index]; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH2_UID: case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH2_PARENT_UID: value = from_kuid(&init_user_ns, stat->uid); break; case TOMOYO_PATH1_GID: case TOMOYO_PATH2_GID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH2_PARENT_GID: value = from_kgid(&init_user_ns, stat->gid); break; case TOMOYO_PATH1_INO: case TOMOYO_PATH2_INO: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH2_PARENT_INO: value = stat->ino; break; case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH2_MAJOR: value = MAJOR(stat->dev); break; case TOMOYO_PATH1_MINOR: case TOMOYO_PATH2_MINOR: value = MINOR(stat->dev); break; case TOMOYO_PATH1_TYPE: case TOMOYO_PATH2_TYPE: value = stat->mode & S_IFMT; break; case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH2_DEV_MAJOR: value = MAJOR(stat->rdev); break; case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH2_DEV_MINOR: value = MINOR(stat->rdev); break; case TOMOYO_PATH1_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PARENT_PERM: value = stat->mode & S_IALLUGO; break; } } break; } max_v[j] = value; min_v[j] = value; switch (index) { case TOMOYO_MODE_SETUID: case TOMOYO_MODE_SETGID: case TOMOYO_MODE_STICKY: case TOMOYO_MODE_OWNER_READ: case TOMOYO_MODE_OWNER_WRITE: case TOMOYO_MODE_OWNER_EXECUTE: case TOMOYO_MODE_GROUP_READ: case TOMOYO_MODE_GROUP_WRITE: case TOMOYO_MODE_GROUP_EXECUTE: case TOMOYO_MODE_OTHERS_READ: case TOMOYO_MODE_OTHERS_WRITE: case TOMOYO_MODE_OTHERS_EXECUTE: is_bitop[j] = true; } } if (left == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; min_v[0] = ptr->values[0]; max_v[0] = ptr->values[1]; } if (right == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; if (ptr->group) { if (tomoyo_number_matches_group(min_v[0], max_v[0], ptr->group) == match) continue; } else { if ((min_v[0] <= ptr->values[1] && max_v[0] >= ptr->values[0]) == match) continue; } goto out; } /* * Bit operation is valid only when counterpart value * represents permission. */ if (is_bitop[0] && is_bitop[1]) { goto out; } else if (is_bitop[0]) { switch (right) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } else if (is_bitop[1]) { switch (left) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } /* Normal value range comparison. */ if ((min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) == match) continue; out: return false; } /* Check argv[] and envp[] now. */ if (r->ee && (argc || envc)) return tomoyo_scan_bprm(r->ee, argc, argv, envc, envp); return true; }
21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_IRQDESC_H #define _LINUX_IRQDESC_H #include <linux/rcupdate.h> #include <linux/kobject.h> #include <linux/mutex.h> /* * Core internal functions to deal with irq descriptors */ struct irq_affinity_notify; struct proc_dir_entry; struct module; struct irq_desc; struct irq_domain; struct pt_regs; /** * struct irqstat - interrupt statistics * @cnt: real-time interrupt count * @ref: snapshot of interrupt count */ struct irqstat { unsigned int cnt; #ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT unsigned int ref; #endif }; /** * struct irq_desc - interrupt descriptor * @irq_common_data: per irq and chip data passed down to chip functions * @kstat_irqs: irq stats per cpu * @handle_irq: highlevel irq-events handler * @action: the irq action chain * @status_use_accessors: status information * @core_internal_state__do_not_mess_with_it: core internal status information * @depth: disable-depth, for nested irq_disable() calls * @wake_depth: enable depth, for multiple irq_set_irq_wake() callers * @tot_count: stats field for non-percpu irqs * @irq_count: stats field to detect stalled irqs * @last_unhandled: aging timer for unhandled count * @irqs_unhandled: stats field for spurious unhandled interrupts * @threads_handled: stats field for deferred spurious detection of threaded handlers * @threads_handled_last: comparator field for deferred spurious detection of threaded handlers * @lock: locking for SMP * @affinity_hint: hint to user space for preferred irq affinity * @affinity_notify: context for notification of affinity changes * @pending_mask: pending rebalanced interrupts * @threads_oneshot: bitfield to handle shared oneshot threads * @threads_active: number of irqaction threads currently running * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers * @nr_actions: number of installed actions on this descriptor * @no_suspend_depth: number of irqactions on a irq descriptor with * IRQF_NO_SUSPEND set * @force_resume_depth: number of irqactions on a irq descriptor with * IRQF_FORCE_RESUME set * @rcu: rcu head for delayed free * @kobj: kobject used to represent this struct in sysfs * @request_mutex: mutex to protect request/free before locking desc->lock * @dir: /proc/irq/ procfs entry * @debugfs_file: dentry for the debugfs file * @name: flow handler name for /proc/interrupts output */ struct irq_desc { struct irq_common_data irq_common_data; struct irq_data irq_data; struct irqstat __percpu *kstat_irqs; irq_flow_handler_t handle_irq; struct irqaction *action; /* IRQ action list */ unsigned int status_use_accessors; unsigned int core_internal_state__do_not_mess_with_it; unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ unsigned int tot_count; unsigned int irq_count; /* For detecting broken IRQs */ unsigned long last_unhandled; /* Aging timer for unhandled count */ unsigned int irqs_unhandled; atomic_t threads_handled; int threads_handled_last; raw_spinlock_t lock; struct cpumask *percpu_enabled; const struct cpumask *percpu_affinity; #ifdef CONFIG_SMP const struct cpumask *affinity_hint; struct irq_affinity_notify *affinity_notify; #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_var_t pending_mask; #endif #endif unsigned long threads_oneshot; atomic_t threads_active; wait_queue_head_t wait_for_threads; #ifdef CONFIG_PM_SLEEP unsigned int nr_actions; unsigned int no_suspend_depth; unsigned int cond_suspend_depth; unsigned int force_resume_depth; #endif #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; #endif #ifdef CONFIG_GENERIC_IRQ_DEBUGFS struct dentry *debugfs_file; const char *dev_name; #endif #ifdef CONFIG_SPARSE_IRQ struct rcu_head rcu; struct kobject kobj; #endif struct mutex request_mutex; int parent_irq; struct module *owner; const char *name; #ifdef CONFIG_HARDIRQS_SW_RESEND struct hlist_node resend_node; #endif } ____cacheline_internodealigned_in_smp; #ifdef CONFIG_SPARSE_IRQ extern void irq_lock_sparse(void); extern void irq_unlock_sparse(void); #else static inline void irq_lock_sparse(void) { } static inline void irq_unlock_sparse(void) { } extern struct irq_desc irq_desc[NR_IRQS]; #endif static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc, unsigned int cpu) { return desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) { return container_of(data->common, struct irq_desc, irq_common_data); } static inline unsigned int irq_desc_get_irq(struct irq_desc *desc) { return desc->irq_data.irq; } static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) { return &desc->irq_data; } static inline struct irq_chip *irq_desc_get_chip(struct irq_desc *desc) { return desc->irq_data.chip; } static inline void *irq_desc_get_chip_data(struct irq_desc *desc) { return desc->irq_data.chip_data; } static inline void *irq_desc_get_handler_data(struct irq_desc *desc) { return desc->irq_common_data.handler_data; } /* * Architectures call this to let the generic IRQ layer * handle an interrupt. */ static inline void generic_handle_irq_desc(struct irq_desc *desc) { desc->handle_irq(desc); } int handle_irq_desc(struct irq_desc *desc); int generic_handle_irq(unsigned int irq); int generic_handle_irq_safe(unsigned int irq); #ifdef CONFIG_IRQ_DOMAIN /* * Convert a HW interrupt number to a logical one using a IRQ domain, * and handle the result interrupt number. Return -EINVAL if * conversion failed. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq); int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq); int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq); #endif /* Test to see if a driver has successfully requested an irq */ static inline int irq_desc_has_action(struct irq_desc *desc) { return desc && desc->action != NULL; } /** * irq_set_handler_locked - Set irq handler from a locked region * @data: Pointer to the irq_data structure which identifies the irq * @handler: Flow control handler function for this interrupt * * Sets the handler in the irq descriptor associated to @data. * * Must be called with irq_desc locked and valid parameters. Typical * call site is the irq_set_type() callback. */ static inline void irq_set_handler_locked(struct irq_data *data, irq_flow_handler_t handler) { struct irq_desc *desc = irq_data_to_desc(data); desc->handle_irq = handler; } /** * irq_set_chip_handler_name_locked - Set chip, handler and name from a locked region * @data: Pointer to the irq_data structure for which the chip is set * @chip: Pointer to the new irq chip * @handler: Flow control handler function for this interrupt * @name: Name of the interrupt * * Replace the irq chip at the proper hierarchy level in @data and * sets the handler and name in the associated irq descriptor. * * Must be called with irq_desc locked and valid parameters. */ static inline void irq_set_chip_handler_name_locked(struct irq_data *data, const struct irq_chip *chip, irq_flow_handler_t handler, const char *name) { struct irq_desc *desc = irq_data_to_desc(data); desc->handle_irq = handler; desc->name = name; data->chip = (struct irq_chip *)chip; } bool irq_check_status_bit(unsigned int irq, unsigned int bitmask); static inline bool irq_balancing_disabled(unsigned int irq) { return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK); } static inline bool irq_is_percpu(unsigned int irq) { return irq_check_status_bit(irq, IRQ_PER_CPU); } static inline bool irq_is_percpu_devid(unsigned int irq) { return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class); static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { if (IS_ENABLED(CONFIG_LOCKDEP)) __irq_set_lockdep_class(irq, lock_class, request_class); } #endif
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Declarations for error reporting tracepoints. * * Copyright (C) 2021, Google LLC. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM error_report #if !defined(_TRACE_ERROR_REPORT_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ERROR_REPORT_H #include <linux/tracepoint.h> #ifndef __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY #define __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY enum error_detector { ERROR_DETECTOR_KFENCE, ERROR_DETECTOR_KASAN, ERROR_DETECTOR_WARN, }; #endif /* __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY */ #define error_detector_list \ EM(ERROR_DETECTOR_KFENCE, "kfence") \ EM(ERROR_DETECTOR_KASAN, "kasan") \ EMe(ERROR_DETECTOR_WARN, "warning") /* Always end the list with an EMe. */ #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); error_detector_list #undef EM #undef EMe #define EM(a, b) { a, b }, #define EMe(a, b) { a, b } #define show_error_detector_list(val) \ __print_symbolic(val, error_detector_list) DECLARE_EVENT_CLASS(error_report_template, TP_PROTO(enum error_detector error_detector, unsigned long id), TP_ARGS(error_detector, id), TP_STRUCT__entry(__field(enum error_detector, error_detector) __field(unsigned long, id)), TP_fast_assign(__entry->error_detector = error_detector; __entry->id = id;), TP_printk("[%s] %lx", show_error_detector_list(__entry->error_detector), __entry->id)); /** * error_report_end - called after printing the error report * @error_detector: short string describing the error detection tool * @id: pseudo-unique descriptor identifying the report * (e.g. the memory access address) * * This event occurs right after a debugging tool finishes printing the error * report. */ DEFINE_EVENT(error_report_template, error_report_end, TP_PROTO(enum error_detector error_detector, unsigned long id), TP_ARGS(error_detector, id)); #endif /* _TRACE_ERROR_REPORT_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
6 1 1 6 73 74 6 74 21 16 8 15 7 11 6 1 6 13 13 6 81 82 1 7 63 63 63 6 81 82 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Ptrace hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2019-2020 ANSSI */ #include <asm/current.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include <net/af_unix.h> #include <net/sock.h> #include "common.h" #include "cred.h" #include "fs.h" #include "ruleset.h" #include "setup.h" #include "task.h" /** * domain_scope_le - Checks domain ordering for scoped ptrace * * @parent: Parent domain. * @child: Potential child of @parent. * * Checks if the @parent domain is less or equal to (i.e. an ancestor, which * means a subset of) the @child domain. */ static bool domain_scope_le(const struct landlock_ruleset *const parent, const struct landlock_ruleset *const child) { const struct landlock_hierarchy *walker; if (!parent) return true; if (!child) return false; for (walker = child->hierarchy; walker; walker = walker->parent) { if (walker == parent->hierarchy) /* @parent is in the scoped hierarchy of @child. */ return true; } /* There is no relationship between @parent and @child. */ return false; } static bool task_is_scoped(const struct task_struct *const parent, const struct task_struct *const child) { bool is_scoped; const struct landlock_ruleset *dom_parent, *dom_child; rcu_read_lock(); dom_parent = landlock_get_task_domain(parent); dom_child = landlock_get_task_domain(child); is_scoped = domain_scope_le(dom_parent, dom_child); rcu_read_unlock(); return is_scoped; } static int task_ptrace(const struct task_struct *const parent, const struct task_struct *const child) { /* Quick return for non-landlocked tasks. */ if (!landlocked(parent)) return 0; if (task_is_scoped(parent, child)) return 0; return -EPERM; } /** * hook_ptrace_access_check - Determines whether the current process may access * another * * @child: Process to be accessed. * @mode: Mode of attachment. * * If the current task has Landlock rules, then the child must have at least * the same rules. Else denied. * * Determines whether a process may access another, returning 0 if permission * granted, -errno if denied. */ static int hook_ptrace_access_check(struct task_struct *const child, const unsigned int mode) { return task_ptrace(current, child); } /** * hook_ptrace_traceme - Determines whether another process may trace the * current one * * @parent: Task proposed to be the tracer. * * If the parent has Landlock rules, then the current task must have the same * or more rules. Else denied. * * Determines whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -errno if denied. */ static int hook_ptrace_traceme(struct task_struct *const parent) { return task_ptrace(parent, current); } /** * domain_is_scoped - Checks if the client domain is scoped in the same * domain as the server. * * @client: IPC sender domain. * @server: IPC receiver domain. * @scope: The scope restriction criteria. * * Returns: True if the @client domain is scoped to access the @server, * unless the @server is also scoped in the same domain as @client. */ static bool domain_is_scoped(const struct landlock_ruleset *const client, const struct landlock_ruleset *const server, access_mask_t scope) { int client_layer, server_layer; struct landlock_hierarchy *client_walker, *server_walker; /* Quick return if client has no domain */ if (WARN_ON_ONCE(!client)) return false; client_layer = client->num_layers - 1; client_walker = client->hierarchy; /* * client_layer must be a signed integer with greater capacity * than client->num_layers to ensure the following loop stops. */ BUILD_BUG_ON(sizeof(client_layer) > sizeof(client->num_layers)); server_layer = server ? (server->num_layers - 1) : -1; server_walker = server ? server->hierarchy : NULL; /* * Walks client's parent domains down to the same hierarchy level * as the server's domain, and checks that none of these client's * parent domains are scoped. */ for (; client_layer > server_layer; client_layer--) { if (landlock_get_scope_mask(client, client_layer) & scope) return true; client_walker = client_walker->parent; } /* * Walks server's parent domains down to the same hierarchy level as * the client's domain. */ for (; server_layer > client_layer; server_layer--) server_walker = server_walker->parent; for (; client_layer >= 0; client_layer--) { if (landlock_get_scope_mask(client, client_layer) & scope) { /* * Client and server are at the same level in the * hierarchy. If the client is scoped, the request is * only allowed if this domain is also a server's * ancestor. */ return server_walker != client_walker; } client_walker = client_walker->parent; server_walker = server_walker->parent; } return false; } static bool sock_is_scoped(struct sock *const other, const struct landlock_ruleset *const domain) { const struct landlock_ruleset *dom_other; /* The credentials will not change. */ lockdep_assert_held(&unix_sk(other)->lock); dom_other = landlock_cred(other->sk_socket->file->f_cred)->domain; return domain_is_scoped(domain, dom_other, LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET); } static bool is_abstract_socket(struct sock *const sock) { struct unix_address *addr = unix_sk(sock)->addr; if (!addr) return false; if (addr->len >= offsetof(struct sockaddr_un, sun_path) + 1 && addr->name->sun_path[0] == '\0') return true; return false; } static const struct access_masks unix_scope = { .scope = LANDLOCK_SCOPE_ABSTRACT_UNIX_SOCKET, }; static int hook_unix_stream_connect(struct sock *const sock, struct sock *const other, struct sock *const newsk) { const struct landlock_ruleset *const dom = landlock_get_applicable_domain(landlock_get_current_domain(), unix_scope); /* Quick return for non-landlocked tasks. */ if (!dom) return 0; if (is_abstract_socket(other) && sock_is_scoped(other, dom)) return -EPERM; return 0; } static int hook_unix_may_send(struct socket *const sock, struct socket *const other) { const struct landlock_ruleset *const dom = landlock_get_applicable_domain(landlock_get_current_domain(), unix_scope); if (!dom) return 0; /* * Checks if this datagram socket was already allowed to be connected * to other. */ if (unix_peer(sock->sk) == other->sk) return 0; if (is_abstract_socket(other->sk) && sock_is_scoped(other->sk, dom)) return -EPERM; return 0; } static const struct access_masks signal_scope = { .scope = LANDLOCK_SCOPE_SIGNAL, }; static int hook_task_kill(struct task_struct *const p, struct kernel_siginfo *const info, const int sig, const struct cred *const cred) { bool is_scoped; const struct landlock_ruleset *dom; if (cred) { /* Dealing with USB IO. */ dom = landlock_cred(cred)->domain; } else { dom = landlock_get_current_domain(); } dom = landlock_get_applicable_domain(dom, signal_scope); /* Quick return for non-landlocked tasks. */ if (!dom) return 0; rcu_read_lock(); is_scoped = domain_is_scoped(dom, landlock_get_task_domain(p), LANDLOCK_SCOPE_SIGNAL); rcu_read_unlock(); if (is_scoped) return -EPERM; return 0; } static int hook_file_send_sigiotask(struct task_struct *tsk, struct fown_struct *fown, int signum) { const struct landlock_ruleset *dom; bool is_scoped = false; /* Lock already held by send_sigio() and send_sigurg(). */ lockdep_assert_held(&fown->lock); dom = landlock_get_applicable_domain( landlock_file(fown->file)->fown_domain, signal_scope); /* Quick return for unowned socket. */ if (!dom) return 0; rcu_read_lock(); is_scoped = domain_is_scoped(dom, landlock_get_task_domain(tsk), LANDLOCK_SCOPE_SIGNAL); rcu_read_unlock(); if (is_scoped) return -EPERM; return 0; } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, hook_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, hook_ptrace_traceme), LSM_HOOK_INIT(unix_stream_connect, hook_unix_stream_connect), LSM_HOOK_INIT(unix_may_send, hook_unix_may_send), LSM_HOOK_INIT(task_kill, hook_task_kill), LSM_HOOK_INIT(file_send_sigiotask, hook_file_send_sigiotask), }; __init void landlock_add_task_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); }
2 2 3 2 3 3 3 2 1 2 3 2 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 // SPDX-License-Identifier: GPL-2.0 /* * Fintek F81232 USB to serial adaptor driver * Fintek F81532A/534A/535/536 USB to 2/4/8/12 serial adaptor driver * * Copyright (C) 2012 Greg Kroah-Hartman (gregkh@linuxfoundation.org) * Copyright (C) 2012 Linux Foundation */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mutex.h> #include <linux/uaccess.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/serial_reg.h> #define F81232_ID \ { USB_DEVICE(0x1934, 0x0706) } /* 1 port UART device */ #define F81534A_SERIES_ID \ { USB_DEVICE(0x2c42, 0x1602) }, /* In-Box 2 port UART device */ \ { USB_DEVICE(0x2c42, 0x1604) }, /* In-Box 4 port UART device */ \ { USB_DEVICE(0x2c42, 0x1605) }, /* In-Box 8 port UART device */ \ { USB_DEVICE(0x2c42, 0x1606) }, /* In-Box 12 port UART device */ \ { USB_DEVICE(0x2c42, 0x1608) }, /* Non-Flash type */ \ { USB_DEVICE(0x2c42, 0x1632) }, /* 2 port UART device */ \ { USB_DEVICE(0x2c42, 0x1634) }, /* 4 port UART device */ \ { USB_DEVICE(0x2c42, 0x1635) }, /* 8 port UART device */ \ { USB_DEVICE(0x2c42, 0x1636) } /* 12 port UART device */ #define F81534A_CTRL_ID \ { USB_DEVICE(0x2c42, 0x16f8) } /* Global control device */ static const struct usb_device_id f81232_id_table[] = { F81232_ID, { } /* Terminating entry */ }; static const struct usb_device_id f81534a_id_table[] = { F81534A_SERIES_ID, { } /* Terminating entry */ }; static const struct usb_device_id f81534a_ctrl_id_table[] = { F81534A_CTRL_ID, { } /* Terminating entry */ }; static const struct usb_device_id combined_id_table[] = { F81232_ID, F81534A_SERIES_ID, F81534A_CTRL_ID, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, combined_id_table); /* Maximum baudrate for F81232 */ #define F81232_MAX_BAUDRATE 1500000 #define F81232_DEF_BAUDRATE 9600 /* USB Control EP parameter */ #define F81232_REGISTER_REQUEST 0xa0 #define F81232_GET_REGISTER 0xc0 #define F81232_SET_REGISTER 0x40 #define F81534A_ACCESS_REG_RETRY 2 #define SERIAL_BASE_ADDRESS 0x0120 #define RECEIVE_BUFFER_REGISTER (0x00 + SERIAL_BASE_ADDRESS) #define INTERRUPT_ENABLE_REGISTER (0x01 + SERIAL_BASE_ADDRESS) #define FIFO_CONTROL_REGISTER (0x02 + SERIAL_BASE_ADDRESS) #define LINE_CONTROL_REGISTER (0x03 + SERIAL_BASE_ADDRESS) #define MODEM_CONTROL_REGISTER (0x04 + SERIAL_BASE_ADDRESS) #define LINE_STATUS_REGISTER (0x05 + SERIAL_BASE_ADDRESS) #define MODEM_STATUS_REGISTER (0x06 + SERIAL_BASE_ADDRESS) /* * F81232 Clock registers (106h) * * Bit1-0: Clock source selector * 00: 1.846MHz. * 01: 18.46MHz. * 10: 24MHz. * 11: 14.77MHz. */ #define F81232_CLK_REGISTER 0x106 #define F81232_CLK_1_846_MHZ 0 #define F81232_CLK_18_46_MHZ BIT(0) #define F81232_CLK_24_MHZ BIT(1) #define F81232_CLK_14_77_MHZ (BIT(1) | BIT(0)) #define F81232_CLK_MASK GENMASK(1, 0) #define F81534A_MODE_REG 0x107 #define F81534A_TRIGGER_MASK GENMASK(3, 2) #define F81534A_TRIGGER_MULTIPLE_4X BIT(3) #define F81534A_FIFO_128BYTE (BIT(1) | BIT(0)) /* Serial port self GPIO control, 2bytes [control&output data][input data] */ #define F81534A_GPIO_REG 0x10e #define F81534A_GPIO_MODE2_DIR BIT(6) /* 1: input, 0: output */ #define F81534A_GPIO_MODE1_DIR BIT(5) #define F81534A_GPIO_MODE0_DIR BIT(4) #define F81534A_GPIO_MODE2_OUTPUT BIT(2) #define F81534A_GPIO_MODE1_OUTPUT BIT(1) #define F81534A_GPIO_MODE0_OUTPUT BIT(0) #define F81534A_CTRL_CMD_ENABLE_PORT 0x116 struct f81232_private { struct mutex lock; u8 modem_control; u8 modem_status; u8 shadow_lcr; speed_t baud_base; struct work_struct lsr_work; struct work_struct interrupt_work; struct usb_serial_port *port; }; static u32 const baudrate_table[] = { 115200, 921600, 1152000, 1500000 }; static u8 const clock_table[] = { F81232_CLK_1_846_MHZ, F81232_CLK_14_77_MHZ, F81232_CLK_18_46_MHZ, F81232_CLK_24_MHZ }; static int calc_baud_divisor(speed_t baudrate, speed_t clockrate) { return DIV_ROUND_CLOSEST(clockrate, baudrate); } static int f81232_get_register(struct usb_serial_port *port, u16 reg, u8 *val) { int status; struct usb_device *dev = port->serial->dev; status = usb_control_msg_recv(dev, 0, F81232_REGISTER_REQUEST, F81232_GET_REGISTER, reg, 0, val, sizeof(*val), USB_CTRL_GET_TIMEOUT, GFP_KERNEL); if (status) { dev_err(&port->dev, "%s failed status: %d\n", __func__, status); status = usb_translate_errors(status); } return status; } static int f81232_set_register(struct usb_serial_port *port, u16 reg, u8 val) { int status; struct usb_device *dev = port->serial->dev; status = usb_control_msg_send(dev, 0, F81232_REGISTER_REQUEST, F81232_SET_REGISTER, reg, 0, &val, sizeof(val), USB_CTRL_SET_TIMEOUT, GFP_KERNEL); if (status) { dev_err(&port->dev, "%s failed status: %d\n", __func__, status); status = usb_translate_errors(status); } return status; } static int f81232_set_mask_register(struct usb_serial_port *port, u16 reg, u8 mask, u8 val) { int status; u8 tmp; status = f81232_get_register(port, reg, &tmp); if (status) return status; tmp = (tmp & ~mask) | (val & mask); return f81232_set_register(port, reg, tmp); } static void f81232_read_msr(struct usb_serial_port *port) { int status; u8 current_msr; struct tty_struct *tty; struct f81232_private *priv = usb_get_serial_port_data(port); mutex_lock(&priv->lock); status = f81232_get_register(port, MODEM_STATUS_REGISTER, &current_msr); if (status) { dev_err(&port->dev, "%s fail, status: %d\n", __func__, status); mutex_unlock(&priv->lock); return; } if (!(current_msr & UART_MSR_ANY_DELTA)) { mutex_unlock(&priv->lock); return; } priv->modem_status = current_msr; if (current_msr & UART_MSR_DCTS) port->icount.cts++; if (current_msr & UART_MSR_DDSR) port->icount.dsr++; if (current_msr & UART_MSR_TERI) port->icount.rng++; if (current_msr & UART_MSR_DDCD) { port->icount.dcd++; tty = tty_port_tty_get(&port->port); if (tty) { usb_serial_handle_dcd_change(port, tty, current_msr & UART_MSR_DCD); tty_kref_put(tty); } } wake_up_interruptible(&port->port.delta_msr_wait); mutex_unlock(&priv->lock); } static int f81232_set_mctrl(struct usb_serial_port *port, unsigned int set, unsigned int clear) { u8 val; int status; struct f81232_private *priv = usb_get_serial_port_data(port); if (((set | clear) & (TIOCM_DTR | TIOCM_RTS)) == 0) return 0; /* no change */ /* 'set' takes precedence over 'clear' */ clear &= ~set; /* force enable interrupt with OUT2 */ mutex_lock(&priv->lock); val = UART_MCR_OUT2 | priv->modem_control; if (clear & TIOCM_DTR) val &= ~UART_MCR_DTR; if (clear & TIOCM_RTS) val &= ~UART_MCR_RTS; if (set & TIOCM_DTR) val |= UART_MCR_DTR; if (set & TIOCM_RTS) val |= UART_MCR_RTS; dev_dbg(&port->dev, "%s new:%02x old:%02x\n", __func__, val, priv->modem_control); status = f81232_set_register(port, MODEM_CONTROL_REGISTER, val); if (status) { dev_err(&port->dev, "%s set MCR status < 0\n", __func__); mutex_unlock(&priv->lock); return status; } priv->modem_control = val; mutex_unlock(&priv->lock); return 0; } static void f81232_update_line_status(struct usb_serial_port *port, unsigned char *data, size_t actual_length) { struct f81232_private *priv = usb_get_serial_port_data(port); if (!actual_length) return; switch (data[0] & 0x07) { case 0x00: /* msr change */ dev_dbg(&port->dev, "IIR: MSR Change: %02x\n", data[0]); schedule_work(&priv->interrupt_work); break; case 0x02: /* tx-empty */ break; case 0x04: /* rx data available */ break; case 0x06: /* lsr change */ /* we can forget it. the LSR will read from bulk-in */ dev_dbg(&port->dev, "IIR: LSR Change: %02x\n", data[0]); break; } } static void f81232_read_int_callback(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; unsigned int actual_length = urb->actual_length; int status = urb->status; int retval; switch (status) { case 0: /* success */ break; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* this urb is terminated, clean up */ dev_dbg(&port->dev, "%s - urb shutting down with status: %d\n", __func__, status); return; default: dev_dbg(&port->dev, "%s - nonzero urb status received: %d\n", __func__, status); goto exit; } usb_serial_debug_data(&port->dev, __func__, urb->actual_length, urb->transfer_buffer); f81232_update_line_status(port, data, actual_length); exit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(&urb->dev->dev, "%s - usb_submit_urb failed with result %d\n", __func__, retval); } static char f81232_handle_lsr(struct usb_serial_port *port, u8 lsr) { struct f81232_private *priv = usb_get_serial_port_data(port); char tty_flag = TTY_NORMAL; if (!(lsr & UART_LSR_BRK_ERROR_BITS)) return tty_flag; if (lsr & UART_LSR_BI) { tty_flag = TTY_BREAK; port->icount.brk++; usb_serial_handle_break(port); } else if (lsr & UART_LSR_PE) { tty_flag = TTY_PARITY; port->icount.parity++; } else if (lsr & UART_LSR_FE) { tty_flag = TTY_FRAME; port->icount.frame++; } if (lsr & UART_LSR_OE) { port->icount.overrun++; schedule_work(&priv->lsr_work); tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); } return tty_flag; } static void f81232_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; char tty_flag; unsigned int i; u8 lsr; /* * When opening the port we get a 1-byte packet with the current LSR, * which we discard. */ if ((urb->actual_length < 2) || (urb->actual_length % 2)) return; /* bulk-in data: [LSR(1Byte)+DATA(1Byte)][LSR(1Byte)+DATA(1Byte)]... */ for (i = 0; i < urb->actual_length; i += 2) { lsr = data[i]; tty_flag = f81232_handle_lsr(port, lsr); if (port->sysrq) { if (usb_serial_handle_sysrq_char(port, data[i + 1])) continue; } tty_insert_flip_char(&port->port, data[i + 1], tty_flag); } tty_flip_buffer_push(&port->port); } static void f81534a_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; unsigned char *data = urb->transfer_buffer; char tty_flag; unsigned int i; u8 lsr; u8 len; if (urb->actual_length < 3) { dev_err(&port->dev, "short message received: %d\n", urb->actual_length); return; } len = data[0]; if (len != urb->actual_length) { dev_err(&port->dev, "malformed message received: %d (%d)\n", urb->actual_length, len); return; } /* bulk-in data: [LEN][Data.....][LSR] */ lsr = data[len - 1]; tty_flag = f81232_handle_lsr(port, lsr); if (port->sysrq) { for (i = 1; i < len - 1; ++i) { if (!usb_serial_handle_sysrq_char(port, data[i])) { tty_insert_flip_char(&port->port, data[i], tty_flag); } } } else { tty_insert_flip_string_fixed_flag(&port->port, &data[1], tty_flag, len - 2); } tty_flip_buffer_push(&port->port); } static int f81232_break_ctl(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; struct f81232_private *priv = usb_get_serial_port_data(port); int status; mutex_lock(&priv->lock); if (break_state) priv->shadow_lcr |= UART_LCR_SBC; else priv->shadow_lcr &= ~UART_LCR_SBC; status = f81232_set_register(port, LINE_CONTROL_REGISTER, priv->shadow_lcr); if (status) dev_err(&port->dev, "set break failed: %d\n", status); mutex_unlock(&priv->lock); return status; } static int f81232_find_clk(speed_t baudrate) { int idx; for (idx = 0; idx < ARRAY_SIZE(baudrate_table); ++idx) { if (baudrate <= baudrate_table[idx] && baudrate_table[idx] % baudrate == 0) return idx; } return -EINVAL; } static void f81232_set_baudrate(struct tty_struct *tty, struct usb_serial_port *port, speed_t baudrate, speed_t old_baudrate) { struct f81232_private *priv = usb_get_serial_port_data(port); u8 lcr; int divisor; int status = 0; int i; int idx; speed_t baud_list[] = { baudrate, old_baudrate, F81232_DEF_BAUDRATE }; for (i = 0; i < ARRAY_SIZE(baud_list); ++i) { baudrate = baud_list[i]; if (baudrate == 0) { tty_encode_baud_rate(tty, 0, 0); return; } idx = f81232_find_clk(baudrate); if (idx >= 0) { tty_encode_baud_rate(tty, baudrate, baudrate); break; } } if (idx < 0) return; priv->baud_base = baudrate_table[idx]; divisor = calc_baud_divisor(baudrate, priv->baud_base); status = f81232_set_mask_register(port, F81232_CLK_REGISTER, F81232_CLK_MASK, clock_table[idx]); if (status) { dev_err(&port->dev, "%s failed to set CLK_REG: %d\n", __func__, status); return; } status = f81232_get_register(port, LINE_CONTROL_REGISTER, &lcr); /* get LCR */ if (status) { dev_err(&port->dev, "%s failed to get LCR: %d\n", __func__, status); return; } status = f81232_set_register(port, LINE_CONTROL_REGISTER, lcr | UART_LCR_DLAB); /* Enable DLAB */ if (status) { dev_err(&port->dev, "%s failed to set DLAB: %d\n", __func__, status); return; } status = f81232_set_register(port, RECEIVE_BUFFER_REGISTER, divisor & 0x00ff); /* low */ if (status) { dev_err(&port->dev, "%s failed to set baudrate MSB: %d\n", __func__, status); goto reapply_lcr; } status = f81232_set_register(port, INTERRUPT_ENABLE_REGISTER, (divisor & 0xff00) >> 8); /* high */ if (status) { dev_err(&port->dev, "%s failed to set baudrate LSB: %d\n", __func__, status); } reapply_lcr: status = f81232_set_register(port, LINE_CONTROL_REGISTER, lcr & ~UART_LCR_DLAB); if (status) { dev_err(&port->dev, "%s failed to set DLAB: %d\n", __func__, status); } } static int f81232_port_enable(struct usb_serial_port *port) { u8 val; int status; /* fifo on, trigger8, clear TX/RX*/ val = UART_FCR_TRIGGER_8 | UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR | UART_FCR_CLEAR_XMIT; status = f81232_set_register(port, FIFO_CONTROL_REGISTER, val); if (status) { dev_err(&port->dev, "%s failed to set FCR: %d\n", __func__, status); return status; } /* MSR Interrupt only, LSR will read from Bulk-in odd byte */ status = f81232_set_register(port, INTERRUPT_ENABLE_REGISTER, UART_IER_MSI); if (status) { dev_err(&port->dev, "%s failed to set IER: %d\n", __func__, status); return status; } return 0; } static int f81232_port_disable(struct usb_serial_port *port) { int status; status = f81232_set_register(port, INTERRUPT_ENABLE_REGISTER, 0); if (status) { dev_err(&port->dev, "%s failed to set IER: %d\n", __func__, status); return status; } return 0; } static void f81232_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct f81232_private *priv = usb_get_serial_port_data(port); u8 new_lcr = 0; int status = 0; speed_t baudrate; speed_t old_baud; /* Don't change anything if nothing has changed */ if (old_termios && !tty_termios_hw_change(&tty->termios, old_termios)) return; if (C_BAUD(tty) == B0) f81232_set_mctrl(port, 0, TIOCM_DTR | TIOCM_RTS); else if (old_termios && (old_termios->c_cflag & CBAUD) == B0) f81232_set_mctrl(port, TIOCM_DTR | TIOCM_RTS, 0); baudrate = tty_get_baud_rate(tty); if (baudrate > 0) { if (old_termios) old_baud = tty_termios_baud_rate(old_termios); else old_baud = F81232_DEF_BAUDRATE; f81232_set_baudrate(tty, port, baudrate, old_baud); } if (C_PARENB(tty)) { new_lcr |= UART_LCR_PARITY; if (!C_PARODD(tty)) new_lcr |= UART_LCR_EPAR; if (C_CMSPAR(tty)) new_lcr |= UART_LCR_SPAR; } if (C_CSTOPB(tty)) new_lcr |= UART_LCR_STOP; new_lcr |= UART_LCR_WLEN(tty_get_char_size(tty->termios.c_cflag)); mutex_lock(&priv->lock); new_lcr |= (priv->shadow_lcr & UART_LCR_SBC); status = f81232_set_register(port, LINE_CONTROL_REGISTER, new_lcr); if (status) { dev_err(&port->dev, "%s failed to set LCR: %d\n", __func__, status); } priv->shadow_lcr = new_lcr; mutex_unlock(&priv->lock); } static int f81232_tiocmget(struct tty_struct *tty) { int r; struct usb_serial_port *port = tty->driver_data; struct f81232_private *port_priv = usb_get_serial_port_data(port); u8 mcr, msr; /* force get current MSR changed state */ f81232_read_msr(port); mutex_lock(&port_priv->lock); mcr = port_priv->modem_control; msr = port_priv->modem_status; mutex_unlock(&port_priv->lock); r = (mcr & UART_MCR_DTR ? TIOCM_DTR : 0) | (mcr & UART_MCR_RTS ? TIOCM_RTS : 0) | (msr & UART_MSR_CTS ? TIOCM_CTS : 0) | (msr & UART_MSR_DCD ? TIOCM_CAR : 0) | (msr & UART_MSR_RI ? TIOCM_RI : 0) | (msr & UART_MSR_DSR ? TIOCM_DSR : 0); return r; } static int f81232_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; return f81232_set_mctrl(port, set, clear); } static int f81232_open(struct tty_struct *tty, struct usb_serial_port *port) { int result; result = f81232_port_enable(port); if (result) return result; /* Setup termios */ if (tty) f81232_set_termios(tty, port, NULL); result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL); if (result) { dev_err(&port->dev, "%s - failed submitting interrupt urb," " error %d\n", __func__, result); return result; } result = usb_serial_generic_open(tty, port); if (result) { usb_kill_urb(port->interrupt_in_urb); return result; } return 0; } static int f81534a_open(struct tty_struct *tty, struct usb_serial_port *port) { int status; u8 mask; u8 val; val = F81534A_TRIGGER_MULTIPLE_4X | F81534A_FIFO_128BYTE; mask = F81534A_TRIGGER_MASK | F81534A_FIFO_128BYTE; status = f81232_set_mask_register(port, F81534A_MODE_REG, mask, val); if (status) { dev_err(&port->dev, "failed to set MODE_REG: %d\n", status); return status; } return f81232_open(tty, port); } static void f81232_close(struct usb_serial_port *port) { struct f81232_private *port_priv = usb_get_serial_port_data(port); f81232_port_disable(port); usb_serial_generic_close(port); usb_kill_urb(port->interrupt_in_urb); flush_work(&port_priv->interrupt_work); flush_work(&port_priv->lsr_work); } static void f81232_dtr_rts(struct usb_serial_port *port, int on) { if (on) f81232_set_mctrl(port, TIOCM_DTR | TIOCM_RTS, 0); else f81232_set_mctrl(port, 0, TIOCM_DTR | TIOCM_RTS); } static bool f81232_tx_empty(struct usb_serial_port *port) { int status; u8 tmp; status = f81232_get_register(port, LINE_STATUS_REGISTER, &tmp); if (!status) { if ((tmp & UART_LSR_TEMT) != UART_LSR_TEMT) return false; } return true; } static int f81232_carrier_raised(struct usb_serial_port *port) { u8 msr; struct f81232_private *priv = usb_get_serial_port_data(port); mutex_lock(&priv->lock); msr = priv->modem_status; mutex_unlock(&priv->lock); if (msr & UART_MSR_DCD) return 1; return 0; } static void f81232_get_serial(struct tty_struct *tty, struct serial_struct *ss) { struct usb_serial_port *port = tty->driver_data; struct f81232_private *priv = usb_get_serial_port_data(port); ss->baud_base = priv->baud_base; } static void f81232_interrupt_work(struct work_struct *work) { struct f81232_private *priv = container_of(work, struct f81232_private, interrupt_work); f81232_read_msr(priv->port); } static void f81232_lsr_worker(struct work_struct *work) { struct f81232_private *priv; struct usb_serial_port *port; int status; u8 tmp; priv = container_of(work, struct f81232_private, lsr_work); port = priv->port; status = f81232_get_register(port, LINE_STATUS_REGISTER, &tmp); if (status) dev_warn(&port->dev, "read LSR failed: %d\n", status); } static int f81534a_ctrl_set_register(struct usb_interface *intf, u16 reg, u16 size, void *val) { struct usb_device *dev = interface_to_usbdev(intf); int retry = F81534A_ACCESS_REG_RETRY; int status; while (retry--) { status = usb_control_msg_send(dev, 0, F81232_REGISTER_REQUEST, F81232_SET_REGISTER, reg, 0, val, size, USB_CTRL_SET_TIMEOUT, GFP_KERNEL); if (status) { status = usb_translate_errors(status); if (status == -EIO) continue; } break; } if (status) { dev_err(&intf->dev, "failed to set register 0x%x: %d\n", reg, status); } return status; } static int f81534a_ctrl_enable_all_ports(struct usb_interface *intf, bool en) { unsigned char enable[2] = {0}; int status; /* * Enable all available serial ports, define as following: * bit 15 : Reset behavior (when HUB got soft reset) * 0: maintain all serial port enabled state. * 1: disable all serial port. * bit 0~11 : Serial port enable bit. */ if (en) { enable[0] = 0xff; enable[1] = 0x8f; } status = f81534a_ctrl_set_register(intf, F81534A_CTRL_CMD_ENABLE_PORT, sizeof(enable), enable); if (status) dev_err(&intf->dev, "failed to enable ports: %d\n", status); return status; } static int f81534a_ctrl_probe(struct usb_interface *intf, const struct usb_device_id *id) { return f81534a_ctrl_enable_all_ports(intf, true); } static void f81534a_ctrl_disconnect(struct usb_interface *intf) { f81534a_ctrl_enable_all_ports(intf, false); } static int f81534a_ctrl_resume(struct usb_interface *intf) { return f81534a_ctrl_enable_all_ports(intf, true); } static int f81232_port_probe(struct usb_serial_port *port) { struct f81232_private *priv; priv = devm_kzalloc(&port->dev, sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; mutex_init(&priv->lock); INIT_WORK(&priv->interrupt_work, f81232_interrupt_work); INIT_WORK(&priv->lsr_work, f81232_lsr_worker); usb_set_serial_port_data(port, priv); priv->port = port; return 0; } static int f81534a_port_probe(struct usb_serial_port *port) { int status; /* tri-state with pull-high, default RS232 Mode */ status = f81232_set_register(port, F81534A_GPIO_REG, F81534A_GPIO_MODE2_DIR); if (status) return status; return f81232_port_probe(port); } static int f81232_suspend(struct usb_serial *serial, pm_message_t message) { struct usb_serial_port *port = serial->port[0]; struct f81232_private *port_priv = usb_get_serial_port_data(port); int i; for (i = 0; i < ARRAY_SIZE(port->read_urbs); ++i) usb_kill_urb(port->read_urbs[i]); usb_kill_urb(port->interrupt_in_urb); if (port_priv) { flush_work(&port_priv->interrupt_work); flush_work(&port_priv->lsr_work); } return 0; } static int f81232_resume(struct usb_serial *serial) { struct usb_serial_port *port = serial->port[0]; int result; if (tty_port_initialized(&port->port)) { result = usb_submit_urb(port->interrupt_in_urb, GFP_NOIO); if (result) { dev_err(&port->dev, "submit interrupt urb failed: %d\n", result); return result; } } return usb_serial_generic_resume(serial); } static struct usb_serial_driver f81232_device = { .driver = { .name = "f81232", }, .id_table = f81232_id_table, .num_ports = 1, .bulk_in_size = 256, .bulk_out_size = 256, .open = f81232_open, .close = f81232_close, .dtr_rts = f81232_dtr_rts, .carrier_raised = f81232_carrier_raised, .get_serial = f81232_get_serial, .break_ctl = f81232_break_ctl, .set_termios = f81232_set_termios, .tiocmget = f81232_tiocmget, .tiocmset = f81232_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .tx_empty = f81232_tx_empty, .process_read_urb = f81232_process_read_urb, .read_int_callback = f81232_read_int_callback, .port_probe = f81232_port_probe, .suspend = f81232_suspend, .resume = f81232_resume, }; static struct usb_serial_driver f81534a_device = { .driver = { .name = "f81534a", }, .id_table = f81534a_id_table, .num_ports = 1, .open = f81534a_open, .close = f81232_close, .dtr_rts = f81232_dtr_rts, .carrier_raised = f81232_carrier_raised, .get_serial = f81232_get_serial, .break_ctl = f81232_break_ctl, .set_termios = f81232_set_termios, .tiocmget = f81232_tiocmget, .tiocmset = f81232_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .tx_empty = f81232_tx_empty, .process_read_urb = f81534a_process_read_urb, .read_int_callback = f81232_read_int_callback, .port_probe = f81534a_port_probe, .suspend = f81232_suspend, .resume = f81232_resume, }; static struct usb_serial_driver * const serial_drivers[] = { &f81232_device, &f81534a_device, NULL, }; static struct usb_driver f81534a_ctrl_driver = { .name = "f81534a_ctrl", .id_table = f81534a_ctrl_id_table, .probe = f81534a_ctrl_probe, .disconnect = f81534a_ctrl_disconnect, .resume = f81534a_ctrl_resume, }; static int __init f81232_init(void) { int status; status = usb_register_driver(&f81534a_ctrl_driver, THIS_MODULE, KBUILD_MODNAME); if (status) return status; status = usb_serial_register_drivers(serial_drivers, KBUILD_MODNAME, combined_id_table); if (status) { usb_deregister(&f81534a_ctrl_driver); return status; } return 0; } static void __exit f81232_exit(void) { usb_serial_deregister_drivers(serial_drivers); usb_deregister(&f81534a_ctrl_driver); } module_init(f81232_init); module_exit(f81232_exit); MODULE_DESCRIPTION("Fintek F81232/532A/534A/535/536 USB to serial driver"); MODULE_AUTHOR("Greg Kroah-Hartman <gregkh@linuxfoundation.org>"); MODULE_AUTHOR("Peter Hong <peter_hong@fintek.com.tw>"); MODULE_LICENSE("GPL v2");
2 2 1 1 5 5 2 4 3 1 3 1 1 1 4 1 2 3 5 14 1 1 12 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 /* * userio kernel serio device emulation module * Copyright (C) 2015 Red Hat * Copyright (C) 2015 Stephen Chandler Paul <thatslyude@gmail.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2 of the License, or (at * your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser * General Public License for more details. */ #include <linux/circ_buf.h> #include <linux/mutex.h> #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/serio.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/miscdevice.h> #include <linux/sched.h> #include <linux/poll.h> #include <uapi/linux/userio.h> #define USERIO_NAME "userio" #define USERIO_BUFSIZE 16 static struct miscdevice userio_misc; struct userio_device { struct serio *serio; struct mutex mutex; bool running; u8 head; u8 tail; spinlock_t buf_lock; unsigned char buf[USERIO_BUFSIZE]; wait_queue_head_t waitq; }; /** * userio_device_write - Write data from serio to a userio device in userspace * @id: The serio port for the userio device * @val: The data to write to the device */ static int userio_device_write(struct serio *id, unsigned char val) { struct userio_device *userio = id->port_data; scoped_guard(spinlock_irqsave, &userio->buf_lock) { userio->buf[userio->head] = val; userio->head = (userio->head + 1) % USERIO_BUFSIZE; if (userio->head == userio->tail) dev_warn(userio_misc.this_device, "Buffer overflowed, userio client isn't keeping up"); } wake_up_interruptible(&userio->waitq); return 0; } static int userio_char_open(struct inode *inode, struct file *file) { struct userio_device *userio __free(kfree) = kzalloc(sizeof(*userio), GFP_KERNEL); if (!userio) return -ENOMEM; mutex_init(&userio->mutex); spin_lock_init(&userio->buf_lock); init_waitqueue_head(&userio->waitq); userio->serio = kzalloc(sizeof(*userio->serio), GFP_KERNEL); if (!userio->serio) return -ENOMEM; userio->serio->write = userio_device_write; userio->serio->port_data = userio; file->private_data = no_free_ptr(userio); return 0; } static int userio_char_release(struct inode *inode, struct file *file) { struct userio_device *userio = file->private_data; if (userio->running) { /* * Don't free the serio port here, serio_unregister_port() * does it for us. */ serio_unregister_port(userio->serio); } else { kfree(userio->serio); } kfree(userio); return 0; } static size_t userio_fetch_data(struct userio_device *userio, u8 *buf, size_t count, size_t *copylen) { size_t available, len; guard(spinlock_irqsave)(&userio->buf_lock); available = CIRC_CNT_TO_END(userio->head, userio->tail, USERIO_BUFSIZE); len = min(available, count); if (len) { memcpy(buf, &userio->buf[userio->tail], len); userio->tail = (userio->tail + len) % USERIO_BUFSIZE; } *copylen = len; return available; } static ssize_t userio_char_read(struct file *file, char __user *user_buffer, size_t count, loff_t *ppos) { struct userio_device *userio = file->private_data; int error; size_t available, copylen; u8 buf[USERIO_BUFSIZE]; /* * By the time we get here, the data that was waiting might have * been taken by another thread. Grab the buffer lock and check if * there's still any data waiting, otherwise repeat this process * until we have data (unless the file descriptor is non-blocking * of course). */ for (;;) { available = userio_fetch_data(userio, buf, count, &copylen); if (available) break; /* buffer was/is empty */ if (file->f_flags & O_NONBLOCK) return -EAGAIN; /* * count == 0 is special - no IO is done but we check * for error conditions (see above). */ if (count == 0) return 0; error = wait_event_interruptible(userio->waitq, userio->head != userio->tail); if (error) return error; } if (copylen) if (copy_to_user(user_buffer, buf, copylen)) return -EFAULT; return copylen; } static int userio_execute_cmd(struct userio_device *userio, const struct userio_cmd *cmd) { switch (cmd->type) { case USERIO_CMD_REGISTER: if (!userio->serio->id.type) { dev_warn(userio_misc.this_device, "No port type given on /dev/userio\n"); return -EINVAL; } if (userio->running) { dev_warn(userio_misc.this_device, "Begin command sent, but we're already running\n"); return -EBUSY; } userio->running = true; serio_register_port(userio->serio); break; case USERIO_CMD_SET_PORT_TYPE: if (userio->running) { dev_warn(userio_misc.this_device, "Can't change port type on an already running userio instance\n"); return -EBUSY; } userio->serio->id.type = cmd->data; break; case USERIO_CMD_SEND_INTERRUPT: if (!userio->running) { dev_warn(userio_misc.this_device, "The device must be registered before sending interrupts\n"); return -ENODEV; } serio_interrupt(userio->serio, cmd->data, 0); break; default: return -EOPNOTSUPP; } return 0; } static ssize_t userio_char_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct userio_device *userio = file->private_data; struct userio_cmd cmd; int error; if (count != sizeof(cmd)) { dev_warn(userio_misc.this_device, "Invalid payload size\n"); return -EINVAL; } if (copy_from_user(&cmd, buffer, sizeof(cmd))) return -EFAULT; scoped_cond_guard(mutex_intr, return -EINTR, &userio->mutex) { error = userio_execute_cmd(userio, &cmd); if (error) return error; } return count; } static __poll_t userio_char_poll(struct file *file, poll_table *wait) { struct userio_device *userio = file->private_data; poll_wait(file, &userio->waitq, wait); if (userio->head != userio->tail) return EPOLLIN | EPOLLRDNORM; return 0; } static const struct file_operations userio_fops = { .owner = THIS_MODULE, .open = userio_char_open, .release = userio_char_release, .read = userio_char_read, .write = userio_char_write, .poll = userio_char_poll, }; static struct miscdevice userio_misc = { .fops = &userio_fops, .minor = USERIO_MINOR, .name = USERIO_NAME, }; module_driver(userio_misc, misc_register, misc_deregister); MODULE_ALIAS_MISCDEV(USERIO_MINOR); MODULE_ALIAS("devname:" USERIO_NAME); MODULE_AUTHOR("Stephen Chandler Paul <thatslyude@gmail.com>"); MODULE_DESCRIPTION("Virtual Serio Device Support"); MODULE_LICENSE("GPL");
667 34 18 96 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 /* SPDX-License-Identifier: GPL-2.0 */ /* File: linux/posix_acl.h (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #ifndef __LINUX_POSIX_ACL_H #define __LINUX_POSIX_ACL_H #include <linux/bug.h> #include <linux/slab.h> #include <linux/rcupdate.h> #include <linux/refcount.h> #include <uapi/linux/posix_acl.h> struct user_namespace; struct posix_acl_entry { short e_tag; unsigned short e_perm; union { kuid_t e_uid; kgid_t e_gid; }; }; struct posix_acl { refcount_t a_refcount; unsigned int a_count; struct rcu_head a_rcu; struct posix_acl_entry a_entries[] __counted_by(a_count); }; #define FOREACH_ACL_ENTRY(pa, acl, pe) \ for(pa=(acl)->a_entries, pe=pa+(acl)->a_count; pa<pe; pa++) /* * Duplicate an ACL handle. */ static inline struct posix_acl * posix_acl_dup(struct posix_acl *acl) { if (acl) refcount_inc(&acl->a_refcount); return acl; } /* * Free an ACL handle. */ static inline void posix_acl_release(struct posix_acl *acl) { if (acl && refcount_dec_and_test(&acl->a_refcount)) kfree_rcu(acl, a_rcu); } /* posix_acl.c */ extern void posix_acl_init(struct posix_acl *, int); extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags); extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t); extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *); extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *); extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t); extern struct posix_acl *get_posix_acl(struct inode *, int); int set_posix_acl(struct mnt_idmap *, struct dentry *, int, struct posix_acl *); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type); struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags); #ifdef CONFIG_FS_POSIX_ACL int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t); extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **, struct posix_acl **); int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *, struct posix_acl **); int simple_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); extern int simple_acl_create(struct inode *, struct inode *); struct posix_acl *get_cached_acl(struct inode *inode, int type); void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl); void forget_cached_acl(struct inode *inode, int type); void forget_all_cached_acls(struct inode *inode); int posix_acl_valid(struct user_namespace *, const struct posix_acl *); int posix_acl_permission(struct mnt_idmap *, struct inode *, const struct posix_acl *, int); static inline void cache_no_acl(struct inode *inode) { inode->i_acl = NULL; inode->i_default_acl = NULL; } int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, struct posix_acl *kacl); struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name); int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size); #else static inline int posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode) { return 0; } #define simple_set_acl NULL static inline int simple_acl_create(struct inode *dir, struct inode *inode) { return 0; } static inline void cache_no_acl(struct inode *inode) { } static inline int posix_acl_create(struct inode *inode, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { *default_acl = *acl = NULL; return 0; } static inline void forget_all_cached_acls(struct inode *inode) { } static inline int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct posix_acl *acl) { return -EOPNOTSUPP; } static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return ERR_PTR(-EOPNOTSUPP); } static inline int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name) { return -EOPNOTSUPP; } static inline int posix_acl_listxattr(struct inode *inode, char **buffer, ssize_t *remaining_size) { return 0; } #endif /* CONFIG_FS_POSIX_ACL */ struct posix_acl *get_inode_acl(struct inode *inode, int type); #endif /* __LINUX_POSIX_ACL_H */
4 12 12 1 11 2 10 12 7 7 2 1 4 1 4 6 2 2 2 2 2 2 2 3 3 3 3 3 2 1 1 1 1 1 1 1 5 1 4 3 1 4 1 1 2 2 2 3 4 4 4 4 3 1 4 3 1 4 1 1 1 1 1 26 26 21 12 10 13 13 4 4 4 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 * Portions Copyright (C) Christoph Hellwig, 2001-2002 */ #include <linux/fs.h> #include <linux/namei.h> #include <linux/ctype.h> #include <linux/quotaops.h> #include <linux/exportfs.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_inode.h" #include "jfs_dinode.h" #include "jfs_dmap.h" #include "jfs_unicode.h" #include "jfs_metapage.h" #include "jfs_xattr.h" #include "jfs_acl.h" #include "jfs_debug.h" /* * forward references */ const struct dentry_operations jfs_ci_dentry_operations; static s64 commitZeroLink(tid_t, struct inode *); /* * NAME: free_ea_wmap(inode) * * FUNCTION: free uncommitted extended attributes from working map * */ static inline void free_ea_wmap(struct inode *inode) { dxd_t *ea = &JFS_IP(inode)->ea; if (ea->flag & DXD_EXTENT) { /* free EA pages from cache */ invalidate_dxd_metapages(inode, *ea); dbFree(inode, addressDXD(ea), lengthDXD(ea)); } ea->flag = 0; } /* * NAME: jfs_create(dip, dentry, mode) * * FUNCTION: create a regular file in the parent directory <dip> * with name = <from dentry> and mode = <mode> * * PARAMETER: dip - parent directory vnode * dentry - dentry of new file * mode - create mode (rwxrwxrwx). * nd- nd struct * * RETURN: Errors from subroutines * */ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, umode_t mode, bool excl) { int rc = 0; tid_t tid; /* transaction id */ struct inode *ip = NULL; /* child directory inode */ ino_t ino; struct component_name dname; /* child directory name */ struct btstack btstack; struct inode *iplist[2]; struct tblock *tblk; jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry); rc = dquot_initialize(dip); if (rc) goto out1; /* * search parent directory for entry/freespace * (dtSearch() returns parent directory page pinned) */ if ((rc = get_UCSname(&dname, dentry))) goto out1; /* * Either iAlloc() or txBegin() may block. Deadlock can occur if we * block there while holding dtree page, so we allocate the inode & * begin the transaction before we search the directory. */ ip = ialloc(dip, mode); if (IS_ERR(ip)) { rc = PTR_ERR(ip); goto out2; } tid = txBegin(dip->i_sb, 0); mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dip); if (rc) goto out3; rc = jfs_init_security(tid, ip, dip, &dentry->d_name); if (rc) { txAbort(tid, 0); goto out3; } if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { jfs_err("jfs_create: dtSearch returned %d", rc); txAbort(tid, 0); goto out3; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_CREATE; tblk->ino = ip->i_ino; tblk->u.ixpxd = JFS_IP(ip)->ixpxd; iplist[0] = dip; iplist[1] = ip; /* * initialize the child XAD tree root in-line in inode */ xtInitRoot(tid, ip); /* * create entry in parent directory for child directory * (dtInsert() releases parent directory page) */ ino = ip->i_ino; if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { if (rc == -EIO) { jfs_err("jfs_create: dtInsert returned -EIO"); txAbort(tid, 1); /* Marks Filesystem dirty */ } else txAbort(tid, 0); /* Filesystem full */ goto out3; } ip->i_op = &jfs_file_inode_operations; ip->i_fop = &jfs_file_operations; ip->i_mapping->a_ops = &jfs_aops; mark_inode_dirty(ip); inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); mark_inode_dirty(dip); rc = txCommit(tid, 2, &iplist[0], 0); out3: txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); clear_nlink(ip); discard_new_inode(ip); } else { d_instantiate_new(dentry, ip); } out2: free_UCSname(&dname); out1: jfs_info("jfs_create: rc:%d", rc); return rc; } /* * NAME: jfs_mkdir(dip, dentry, mode) * * FUNCTION: create a child directory in the parent directory <dip> * with name = <from dentry> and mode = <mode> * * PARAMETER: dip - parent directory vnode * dentry - dentry of child directory * mode - create mode (rwxrwxrwx). * * RETURN: Errors from subroutines * * note: * EACCES: user needs search+write permission on the parent directory */ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, umode_t mode) { int rc = 0; tid_t tid; /* transaction id */ struct inode *ip = NULL; /* child directory inode */ ino_t ino; struct component_name dname; /* child directory name */ struct btstack btstack; struct inode *iplist[2]; struct tblock *tblk; jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry); rc = dquot_initialize(dip); if (rc) goto out1; /* * search parent directory for entry/freespace * (dtSearch() returns parent directory page pinned) */ if ((rc = get_UCSname(&dname, dentry))) goto out1; /* * Either iAlloc() or txBegin() may block. Deadlock can occur if we * block there while holding dtree page, so we allocate the inode & * begin the transaction before we search the directory. */ ip = ialloc(dip, S_IFDIR | mode); if (IS_ERR(ip)) { rc = PTR_ERR(ip); goto out2; } tid = txBegin(dip->i_sb, 0); mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dip); if (rc) goto out3; rc = jfs_init_security(tid, ip, dip, &dentry->d_name); if (rc) { txAbort(tid, 0); goto out3; } if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { jfs_err("jfs_mkdir: dtSearch returned %d", rc); txAbort(tid, 0); goto out3; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_CREATE; tblk->ino = ip->i_ino; tblk->u.ixpxd = JFS_IP(ip)->ixpxd; iplist[0] = dip; iplist[1] = ip; /* * initialize the child directory in-line in inode */ dtInitRoot(tid, ip, dip->i_ino); /* * create entry in parent directory for child directory * (dtInsert() releases parent directory page) */ ino = ip->i_ino; if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { if (rc == -EIO) { jfs_err("jfs_mkdir: dtInsert returned -EIO"); txAbort(tid, 1); /* Marks Filesystem dirty */ } else txAbort(tid, 0); /* Filesystem full */ goto out3; } set_nlink(ip, 2); /* for '.' */ ip->i_op = &jfs_dir_inode_operations; ip->i_fop = &jfs_dir_operations; mark_inode_dirty(ip); /* update parent directory inode */ inc_nlink(dip); /* for '..' from child directory */ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); mark_inode_dirty(dip); rc = txCommit(tid, 2, &iplist[0], 0); out3: txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); clear_nlink(ip); discard_new_inode(ip); } else { d_instantiate_new(dentry, ip); } out2: free_UCSname(&dname); out1: jfs_info("jfs_mkdir: rc:%d", rc); return rc; } /* * NAME: jfs_rmdir(dip, dentry) * * FUNCTION: remove a link to child directory * * PARAMETER: dip - parent inode * dentry - child directory dentry * * RETURN: -EINVAL - if name is . or .. * -EINVAL - if . or .. exist but are invalid. * errors from subroutines * * note: * if other threads have the directory open when the last link * is removed, the "." and ".." entries, if present, are removed before * rmdir() returns and no new entries may be created in the directory, * but the directory is not removed until the last reference to * the directory is released (cf.unlink() of regular file). */ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) { int rc; tid_t tid; /* transaction id */ struct inode *ip = d_inode(dentry); ino_t ino; struct component_name dname; struct inode *iplist[2]; struct tblock *tblk; jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry); /* Init inode for quota operations. */ rc = dquot_initialize(dip); if (rc) goto out; rc = dquot_initialize(ip); if (rc) goto out; /* directory must be empty to be removed */ if (!dtEmpty(ip)) { rc = -ENOTEMPTY; goto out; } if ((rc = get_UCSname(&dname, dentry))) { goto out; } tid = txBegin(dip->i_sb, 0); mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); iplist[0] = dip; iplist[1] = ip; tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_DELETE; tblk->u.ip = ip; /* * delete the entry of target directory from parent directory */ ino = ip->i_ino; if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { jfs_err("jfs_rmdir: dtDelete returned %d", rc); if (rc == -EIO) txAbort(tid, 1); txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dip)->commit_mutex); goto out2; } /* update parent directory's link count corresponding * to ".." entry of the target directory deleted */ inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); inode_dec_link_count(dip); /* * OS/2 could have created EA and/or ACL */ /* free EA from both persistent and working map */ if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { /* free EA pages */ txEA(tid, ip, &JFS_IP(ip)->ea, NULL); } JFS_IP(ip)->ea.flag = 0; /* free ACL from both persistent and working map */ if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { /* free ACL pages */ txEA(tid, ip, &JFS_IP(ip)->acl, NULL); } JFS_IP(ip)->acl.flag = 0; /* mark the target directory as deleted */ clear_nlink(ip); mark_inode_dirty(ip); rc = txCommit(tid, 2, &iplist[0], 0); txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dip)->commit_mutex); /* * Truncating the directory index table is not guaranteed. It * may need to be done iteratively */ if (test_cflag(COMMIT_Stale, dip)) { if (dip->i_size > 1) jfs_truncate_nolock(dip, 0); clear_cflag(COMMIT_Stale, dip); } out2: free_UCSname(&dname); out: jfs_info("jfs_rmdir: rc:%d", rc); return rc; } /* * NAME: jfs_unlink(dip, dentry) * * FUNCTION: remove a link to object <vp> named by <name> * from parent directory <dvp> * * PARAMETER: dip - inode of parent directory * dentry - dentry of object to be removed * * RETURN: errors from subroutines * * note: * temporary file: if one or more processes have the file open * when the last link is removed, the link will be removed before * unlink() returns, but the removal of the file contents will be * postponed until all references to the files are closed. * * JFS does NOT support unlink() on directories. * */ static int jfs_unlink(struct inode *dip, struct dentry *dentry) { int rc; tid_t tid; /* transaction id */ struct inode *ip = d_inode(dentry); ino_t ino; struct component_name dname; /* object name */ struct inode *iplist[2]; struct tblock *tblk; s64 new_size = 0; int commit_flag; jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry); /* Init inode for quota operations. */ rc = dquot_initialize(dip); if (rc) goto out; rc = dquot_initialize(ip); if (rc) goto out; if ((rc = get_UCSname(&dname, dentry))) goto out; IWRITE_LOCK(ip, RDWRLOCK_NORMAL); tid = txBegin(dip->i_sb, 0); mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); iplist[0] = dip; iplist[1] = ip; /* * delete the entry of target file from parent directory */ ino = ip->i_ino; if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { jfs_err("jfs_unlink: dtDelete returned %d", rc); if (rc == -EIO) txAbort(tid, 1); /* Marks FS Dirty */ txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dip)->commit_mutex); IWRITE_UNLOCK(ip); goto out1; } ASSERT(ip->i_nlink); inode_set_mtime_to_ts(dip, inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip))); mark_inode_dirty(dip); /* update target's inode */ inode_dec_link_count(ip); /* * commit zero link count object */ if (ip->i_nlink == 0) { assert(!test_cflag(COMMIT_Nolink, ip)); /* free block resources */ if ((new_size = commitZeroLink(tid, ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dip)->commit_mutex); IWRITE_UNLOCK(ip); rc = new_size; goto out1; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_DELETE; tblk->u.ip = ip; } /* * Incomplete truncate of file data can * result in timing problems unless we synchronously commit the * transaction. */ if (new_size) commit_flag = COMMIT_SYNC; else commit_flag = 0; /* * If xtTruncate was incomplete, commit synchronously to avoid * timing complications */ rc = txCommit(tid, 2, &iplist[0], commit_flag); txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dip)->commit_mutex); while (new_size && (rc == 0)) { tid = txBegin(dip->i_sb, 0); mutex_lock(&JFS_IP(ip)->commit_mutex); new_size = xtTruncate_pmap(tid, ip, new_size); if (new_size < 0) { txAbort(tid, 1); /* Marks FS Dirty */ rc = new_size; } else rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC); txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); } if (ip->i_nlink == 0) set_cflag(COMMIT_Nolink, ip); IWRITE_UNLOCK(ip); /* * Truncating the directory index table is not guaranteed. It * may need to be done iteratively */ if (test_cflag(COMMIT_Stale, dip)) { if (dip->i_size > 1) jfs_truncate_nolock(dip, 0); clear_cflag(COMMIT_Stale, dip); } out1: free_UCSname(&dname); out: jfs_info("jfs_unlink: rc:%d", rc); return rc; } /* * NAME: commitZeroLink() * * FUNCTION: for non-directory, called by jfs_remove(), * truncate a regular file, directory or symbolic * link to zero length. return 0 if type is not * one of these. * * if the file is currently associated with a VM segment * only permanent disk and inode map resources are freed, * and neither the inode nor indirect blocks are modified * so that the resources can be later freed in the work * map by ctrunc1. * if there is no VM segment on entry, the resources are * freed in both work and permanent map. * (? for temporary file - memory object is cached even * after no reference: * reference count > 0 - ) * * PARAMETERS: cd - pointer to commit data structure. * current inode is the one to truncate. * * RETURN: Errors from subroutines */ static s64 commitZeroLink(tid_t tid, struct inode *ip) { int filetype; struct tblock *tblk; jfs_info("commitZeroLink: tid = %d, ip = 0x%p", tid, ip); filetype = ip->i_mode & S_IFMT; switch (filetype) { case S_IFREG: break; case S_IFLNK: /* fast symbolic link */ if (ip->i_size < IDATASIZE) { ip->i_size = 0; return 0; } break; default: assert(filetype != S_IFDIR); return 0; } set_cflag(COMMIT_Freewmap, ip); /* mark transaction of block map update type */ tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_PMAP; /* * free EA */ if (JFS_IP(ip)->ea.flag & DXD_EXTENT) /* acquire maplock on EA to be freed from block map */ txEA(tid, ip, &JFS_IP(ip)->ea, NULL); /* * free ACL */ if (JFS_IP(ip)->acl.flag & DXD_EXTENT) /* acquire maplock on EA to be freed from block map */ txEA(tid, ip, &JFS_IP(ip)->acl, NULL); /* * free xtree/data (truncate to zero length): * free xtree/data pages from cache if COMMIT_PWMAP, * free xtree/data blocks from persistent block map, and * free xtree/data blocks from working block map if COMMIT_PWMAP; */ if (ip->i_size) return xtTruncate_pmap(tid, ip, 0); return 0; } /* * NAME: jfs_free_zero_link() * * FUNCTION: for non-directory, called by iClose(), * free resources of a file from cache and WORKING map * for a file previously committed with zero link count * while associated with a pager object, * * PARAMETER: ip - pointer to inode of file. */ void jfs_free_zero_link(struct inode *ip) { int type; jfs_info("jfs_free_zero_link: ip = 0x%p", ip); /* return if not reg or symbolic link or if size is * already ok. */ type = ip->i_mode & S_IFMT; switch (type) { case S_IFREG: break; case S_IFLNK: /* if its contained in inode nothing to do */ if (ip->i_size < IDATASIZE) return; break; default: return; } /* * free EA */ if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { s64 xaddr = addressDXD(&JFS_IP(ip)->ea); int xlen = lengthDXD(&JFS_IP(ip)->ea); struct maplock maplock; /* maplock for COMMIT_WMAP */ struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ /* free EA pages from cache */ invalidate_dxd_metapages(ip, JFS_IP(ip)->ea); /* free EA extent from working block map */ maplock.index = 1; pxdlock = (struct pxd_lock *) & maplock; pxdlock->flag = mlckFREEPXD; PXDaddress(&pxdlock->pxd, xaddr); PXDlength(&pxdlock->pxd, xlen); txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); } /* * free ACL */ if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { s64 xaddr = addressDXD(&JFS_IP(ip)->acl); int xlen = lengthDXD(&JFS_IP(ip)->acl); struct maplock maplock; /* maplock for COMMIT_WMAP */ struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ invalidate_dxd_metapages(ip, JFS_IP(ip)->acl); /* free ACL extent from working block map */ maplock.index = 1; pxdlock = (struct pxd_lock *) & maplock; pxdlock->flag = mlckFREEPXD; PXDaddress(&pxdlock->pxd, xaddr); PXDlength(&pxdlock->pxd, xlen); txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); } /* * free xtree/data (truncate to zero length): * free xtree/data pages from cache, and * free xtree/data blocks from working block map; */ if (ip->i_size) xtTruncate(0, ip, 0, COMMIT_WMAP); } /* * NAME: jfs_link(vp, dvp, name, crp) * * FUNCTION: create a link to <vp> by the name = <name> * in the parent directory <dvp> * * PARAMETER: vp - target object * dvp - parent directory of new link * name - name of new link to target object * crp - credential * * RETURN: Errors from subroutines * * note: * JFS does NOT support link() on directories (to prevent circular * path in the directory hierarchy); * EPERM: the target object is a directory, and either the caller * does not have appropriate privileges or the implementation prohibits * using link() on directories [XPG4.2]. * * JFS does NOT support links between file systems: * EXDEV: target object and new link are on different file systems and * implementation does not support links between file systems [XPG4.2]. */ static int jfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int rc; tid_t tid; struct inode *ip = d_inode(old_dentry); ino_t ino; struct component_name dname; struct btstack btstack; struct inode *iplist[2]; jfs_info("jfs_link: %pd %pd", old_dentry, dentry); rc = dquot_initialize(dir); if (rc) goto out; if (isReadOnly(ip)) { jfs_error(ip->i_sb, "read-only filesystem\n"); return -EROFS; } tid = txBegin(ip->i_sb, 0); mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); /* * scan parent directory for entry/freespace */ if ((rc = get_UCSname(&dname, dentry))) goto out_tx; if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) goto free_dname; /* * create entry for new link in parent directory */ ino = ip->i_ino; if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) goto free_dname; /* update object inode */ inc_nlink(ip); /* for new link */ inode_set_ctime_current(ip); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ihold(ip); iplist[0] = ip; iplist[1] = dir; rc = txCommit(tid, 2, &iplist[0], 0); if (rc) { drop_nlink(ip); /* never instantiated */ iput(ip); } else d_instantiate(dentry, ip); free_dname: free_UCSname(&dname); out_tx: txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dir)->commit_mutex); out: jfs_info("jfs_link: rc:%d", rc); return rc; } /* * NAME: jfs_symlink(dip, dentry, name) * * FUNCTION: creates a symbolic link to <symlink> by name <name> * in directory <dip> * * PARAMETER: dip - parent directory vnode * dentry - dentry of symbolic link * name - the path name of the existing object * that will be the source of the link * * RETURN: errors from subroutines * * note: * ENAMETOOLONG: pathname resolution of a symbolic link produced * an intermediate result whose length exceeds PATH_MAX [XPG4.2] */ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, struct dentry *dentry, const char *name) { int rc; tid_t tid; ino_t ino = 0; struct component_name dname; u32 ssize; /* source pathname size */ struct btstack btstack; struct inode *ip; s64 xlen = 0; int bmask = 0, xsize; s64 xaddr; struct metapage *mp; struct super_block *sb; struct tblock *tblk; struct inode *iplist[2]; jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); rc = dquot_initialize(dip); if (rc) goto out1; ssize = strlen(name) + 1; /* * search parent directory for entry/freespace * (dtSearch() returns parent directory page pinned) */ if ((rc = get_UCSname(&dname, dentry))) goto out1; /* * allocate on-disk/in-memory inode for symbolic link: * (iAlloc() returns new, locked inode) */ ip = ialloc(dip, S_IFLNK | 0777); if (IS_ERR(ip)) { rc = PTR_ERR(ip); goto out2; } tid = txBegin(dip->i_sb, 0); mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_security(tid, ip, dip, &dentry->d_name); if (rc) goto out3; tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_CREATE; tblk->ino = ip->i_ino; tblk->u.ixpxd = JFS_IP(ip)->ixpxd; /* fix symlink access permission * (dir_create() ANDs in the u.u_cmask, * but symlinks really need to be 777 access) */ ip->i_mode |= 0777; /* * write symbolic link target path name */ xtInitRoot(tid, ip); /* * write source path name inline in on-disk inode (fast symbolic link) */ if (ssize <= IDATASIZE) { ip->i_op = &jfs_fast_symlink_inode_operations; ip->i_link = JFS_IP(ip)->i_inline_all; memcpy(ip->i_link, name, ssize); ip->i_size = ssize - 1; /* * if symlink is > 128 bytes, we don't have the space to * store inline extended attributes */ if (ssize > sizeof (JFS_IP(ip)->i_inline)) JFS_IP(ip)->mode2 &= ~INLINEEA; jfs_info("jfs_symlink: fast symlink added ssize:%u name:%s ", ssize, name); } /* * write source path name in a single extent */ else { jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); ip->i_op = &jfs_symlink_inode_operations; inode_nohighmem(ip); ip->i_mapping->a_ops = &jfs_aops; /* * even though the data of symlink object (source * path name) is treated as non-journaled user data, * it is read/written thru buffer cache for performance. */ sb = ip->i_sb; bmask = JFS_SBI(sb)->bsize - 1; xsize = (ssize + bmask) & ~bmask; xaddr = 0; xlen = xsize >> JFS_SBI(sb)->l2bsize; if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) { txAbort(tid, 0); goto out3; } ip->i_size = ssize - 1; while (ssize) { /* This is kind of silly since PATH_MAX == 4K */ u32 copy_size = min_t(u32, ssize, PSIZE); mp = get_metapage(ip, xaddr, PSIZE, 1); if (mp == NULL) { xtTruncate(tid, ip, 0, COMMIT_PWMAP); rc = -EIO; txAbort(tid, 0); goto out3; } memcpy(mp->data, name, copy_size); flush_metapage(mp); ssize -= copy_size; name += copy_size; xaddr += JFS_SBI(sb)->nbperpage; } } /* * create entry for symbolic link in parent directory */ rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE); if (rc == 0) { ino = ip->i_ino; rc = dtInsert(tid, dip, &dname, &ino, &btstack); } if (rc) { if (xlen) xtTruncate(tid, ip, 0, COMMIT_PWMAP); txAbort(tid, 0); /* discard new inode */ goto out3; } mark_inode_dirty(ip); inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); mark_inode_dirty(dip); /* * commit update of parent directory and link object */ iplist[0] = dip; iplist[1] = ip; rc = txCommit(tid, 2, &iplist[0], 0); out3: txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dip)->commit_mutex); if (rc) { free_ea_wmap(ip); clear_nlink(ip); discard_new_inode(ip); } else { d_instantiate_new(dentry, ip); } out2: free_UCSname(&dname); out1: jfs_info("jfs_symlink: rc:%d", rc); return rc; } /* * NAME: jfs_rename * * FUNCTION: rename a file or directory */ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct btstack btstack; ino_t ino; struct component_name new_dname; struct inode *new_ip; struct component_name old_dname; struct inode *old_ip; int rc; tid_t tid; struct tlock *tlck; struct dt_lock *dtlck; struct lv *lv; int ipcount; struct inode *iplist[4]; struct tblock *tblk; s64 new_size = 0; int commit_flag; if (flags & ~RENAME_NOREPLACE) return -EINVAL; jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry); rc = dquot_initialize(old_dir); if (rc) goto out1; rc = dquot_initialize(new_dir); if (rc) goto out1; old_ip = d_inode(old_dentry); new_ip = d_inode(new_dentry); if ((rc = get_UCSname(&old_dname, old_dentry))) goto out1; if ((rc = get_UCSname(&new_dname, new_dentry))) goto out2; /* * Make sure source inode number is what we think it is */ rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP); if (rc || (ino != old_ip->i_ino)) { rc = -ENOENT; goto out3; } /* * Make sure dest inode number (if any) is what we think it is */ rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); if (!rc) { if ((!new_ip) || (ino != new_ip->i_ino)) { rc = -ESTALE; goto out3; } } else if (rc != -ENOENT) goto out3; else if (new_ip) { /* no entry exists, but one was expected */ rc = -ESTALE; goto out3; } if (S_ISDIR(old_ip->i_mode)) { if (new_ip) { if (!dtEmpty(new_ip)) { rc = -ENOTEMPTY; goto out3; } } } else if (new_ip) { IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. */ rc = dquot_initialize(new_ip); if (rc) goto out_unlock; } /* * The real work starts here */ tid = txBegin(new_dir->i_sb, 0); /* * How do we know the locking is safe from deadlocks? * The vfs does the hard part for us. Any time we are taking nested * commit_mutexes, the vfs already has i_mutex held on the parent. * Here, the vfs has already taken i_mutex on both old_dir and new_dir. */ mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD); if (old_dir != new_dir) mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex, COMMIT_MUTEX_SECOND_PARENT); if (new_ip) { mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex, COMMIT_MUTEX_VICTIM); /* * Change existing directory entry to new inode number */ ino = new_ip->i_ino; rc = dtModify(tid, new_dir, &new_dname, &ino, old_ip->i_ino, JFS_RENAME); if (rc) goto out_tx; drop_nlink(new_ip); if (S_ISDIR(new_ip->i_mode)) { drop_nlink(new_ip); if (new_ip->i_nlink) { mutex_unlock(&JFS_IP(new_ip)->commit_mutex); if (old_dir != new_dir) mutex_unlock(&JFS_IP(old_dir)->commit_mutex); mutex_unlock(&JFS_IP(old_ip)->commit_mutex); mutex_unlock(&JFS_IP(new_dir)->commit_mutex); if (!S_ISDIR(old_ip->i_mode) && new_ip) IWRITE_UNLOCK(new_ip); jfs_error(new_ip->i_sb, "new_ip->i_nlink != 0\n"); return -EIO; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_DELETE; tblk->u.ip = new_ip; } else if (new_ip->i_nlink == 0) { assert(!test_cflag(COMMIT_Nolink, new_ip)); /* free block resources */ if ((new_size = commitZeroLink(tid, new_ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ rc = new_size; goto out_tx; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_DELETE; tblk->u.ip = new_ip; } else { inode_set_ctime_current(new_ip); mark_inode_dirty(new_ip); } } else { /* * Add new directory entry */ rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_CREATE); if (rc) { jfs_err("jfs_rename didn't expect dtSearch to fail w/rc = %d", rc); goto out_tx; } ino = old_ip->i_ino; rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack); if (rc) { if (rc == -EIO) jfs_err("jfs_rename: dtInsert returned -EIO"); goto out_tx; } if (S_ISDIR(old_ip->i_mode)) inc_nlink(new_dir); } /* * Remove old directory entry */ ino = old_ip->i_ino; rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE); if (rc) { jfs_err("jfs_rename did not expect dtDelete to return rc = %d", rc); txAbort(tid, 1); /* Marks Filesystem dirty */ goto out_tx; } if (S_ISDIR(old_ip->i_mode)) { drop_nlink(old_dir); if (old_dir != new_dir) { /* * Change inode number of parent for moved directory */ JFS_IP(old_ip)->i_dtroot.header.idotdot = cpu_to_le32(new_dir->i_ino); /* Linelock header of dtree */ tlck = txLock(tid, old_ip, (struct metapage *) &JFS_IP(old_ip)->bxflag, tlckDTREE | tlckBTROOT | tlckRELINK); dtlck = (struct dt_lock *) & tlck->lock; ASSERT(dtlck->index == 0); lv = & dtlck->lv[0]; lv->offset = 0; lv->length = 1; dtlck->index++; } } /* * Update ctime on changed/moved inodes & mark dirty */ inode_set_ctime_current(old_ip); mark_inode_dirty(old_ip); inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); mark_inode_dirty(new_dir); /* Build list of inodes modified by this transaction */ ipcount = 0; iplist[ipcount++] = old_ip; if (new_ip) iplist[ipcount++] = new_ip; iplist[ipcount++] = old_dir; if (old_dir != new_dir) { iplist[ipcount++] = new_dir; inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); mark_inode_dirty(old_dir); } /* * Incomplete truncate of file data can * result in timing problems unless we synchronously commit the * transaction. */ if (new_size) commit_flag = COMMIT_SYNC; else commit_flag = 0; rc = txCommit(tid, ipcount, iplist, commit_flag); out_tx: txEnd(tid); if (new_ip) mutex_unlock(&JFS_IP(new_ip)->commit_mutex); if (old_dir != new_dir) mutex_unlock(&JFS_IP(old_dir)->commit_mutex); mutex_unlock(&JFS_IP(old_ip)->commit_mutex); mutex_unlock(&JFS_IP(new_dir)->commit_mutex); while (new_size && (rc == 0)) { tid = txBegin(new_ip->i_sb, 0); mutex_lock(&JFS_IP(new_ip)->commit_mutex); new_size = xtTruncate_pmap(tid, new_ip, new_size); if (new_size < 0) { txAbort(tid, 1); rc = new_size; } else rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC); txEnd(tid); mutex_unlock(&JFS_IP(new_ip)->commit_mutex); } if (new_ip && (new_ip->i_nlink == 0)) set_cflag(COMMIT_Nolink, new_ip); /* * Truncating the directory index table is not guaranteed. It * may need to be done iteratively */ if (test_cflag(COMMIT_Stale, old_dir)) { if (old_dir->i_size > 1) jfs_truncate_nolock(old_dir, 0); clear_cflag(COMMIT_Stale, old_dir); } out_unlock: if (new_ip && !S_ISDIR(new_ip->i_mode)) IWRITE_UNLOCK(new_ip); out3: free_UCSname(&new_dname); out2: free_UCSname(&old_dname); out1: jfs_info("jfs_rename: returning %d", rc); return rc; } /* * NAME: jfs_mknod * * FUNCTION: Create a special file (device) */ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct jfs_inode_info *jfs_ip; struct btstack btstack; struct component_name dname; ino_t ino; struct inode *ip; struct inode *iplist[2]; int rc; tid_t tid; struct tblock *tblk; jfs_info("jfs_mknod: %pd", dentry); rc = dquot_initialize(dir); if (rc) goto out; if ((rc = get_UCSname(&dname, dentry))) goto out; ip = ialloc(dir, mode); if (IS_ERR(ip)) { rc = PTR_ERR(ip); goto out1; } jfs_ip = JFS_IP(ip); tid = txBegin(dir->i_sb, 0); mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dir); if (rc) goto out3; rc = jfs_init_security(tid, ip, dir, &dentry->d_name); if (rc) { txAbort(tid, 0); goto out3; } if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) { txAbort(tid, 0); goto out3; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_CREATE; tblk->ino = ip->i_ino; tblk->u.ixpxd = JFS_IP(ip)->ixpxd; ino = ip->i_ino; if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) { txAbort(tid, 0); goto out3; } ip->i_op = &jfs_file_inode_operations; jfs_ip->dev = new_encode_dev(rdev); init_special_inode(ip, ip->i_mode, rdev); mark_inode_dirty(ip); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); iplist[0] = dir; iplist[1] = ip; rc = txCommit(tid, 2, iplist, 0); out3: txEnd(tid); mutex_unlock(&JFS_IP(ip)->commit_mutex); mutex_unlock(&JFS_IP(dir)->commit_mutex); if (rc) { free_ea_wmap(ip); clear_nlink(ip); discard_new_inode(ip); } else { d_instantiate_new(dentry, ip); } out1: free_UCSname(&dname); out: jfs_info("jfs_mknod: returning %d", rc); return rc; } static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { struct btstack btstack; ino_t inum; struct inode *ip; struct component_name key; int rc; jfs_info("jfs_lookup: name = %pd", dentry); if ((rc = get_UCSname(&key, dentry))) return ERR_PTR(rc); rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); free_UCSname(&key); if (rc == -ENOENT) { ip = NULL; } else if (rc) { jfs_err("jfs_lookup: dtSearch returned %d", rc); ip = ERR_PTR(rc); } else { ip = jfs_iget(dip->i_sb, inum); if (IS_ERR(ip)) jfs_err("jfs_lookup: iget failed on inum %d", (uint)inum); } return d_splice_alias(ip, dentry); } static struct inode *jfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct inode *inode; if (ino == 0) return ERR_PTR(-ESTALE); inode = jfs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, jfs_nfs_get_inode); } struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, jfs_nfs_get_inode); } struct dentry *jfs_get_parent(struct dentry *dentry) { unsigned long parent_ino; parent_ino = le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot); return d_obtain_alias(jfs_iget(dentry->d_sb, parent_ino)); } const struct inode_operations jfs_dir_inode_operations = { .create = jfs_create, .lookup = jfs_lookup, .link = jfs_link, .unlink = jfs_unlink, .symlink = jfs_symlink, .mkdir = jfs_mkdir, .rmdir = jfs_rmdir, .mknod = jfs_mknod, .rename = jfs_rename, .listxattr = jfs_listxattr, .setattr = jfs_setattr, .fileattr_get = jfs_fileattr_get, .fileattr_set = jfs_fileattr_set, #ifdef CONFIG_JFS_POSIX_ACL .get_inode_acl = jfs_get_acl, .set_acl = jfs_set_acl, #endif }; WRAP_DIR_ITER(jfs_readdir) // FIXME! const struct file_operations jfs_dir_operations = { .read = generic_read_dir, .iterate_shared = shared_jfs_readdir, .fsync = jfs_fsync, .unlocked_ioctl = jfs_ioctl, .compat_ioctl = compat_ptr_ioctl, .llseek = generic_file_llseek, }; static int jfs_ci_hash(const struct dentry *dir, struct qstr *this) { unsigned long hash; int i; hash = init_name_hash(dir); for (i=0; i < this->len; i++) hash = partial_name_hash(tolower(this->name[i]), hash); this->hash = end_name_hash(hash); return 0; } static int jfs_ci_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { int i, result = 1; if (len != name->len) goto out; for (i=0; i < len; i++) { if (tolower(str[i]) != tolower(name->name[i])) goto out; } result = 0; out: return result; } static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) { /* * This is not negative dentry. Always valid. * * Note, rename() to existing directory entry will have ->d_inode, * and will use existing name which isn't specified name by user. * * We may be able to drop this positive dentry here. But dropping * positive dentry isn't good idea. So it's unsupported like * rename("filename", "FILENAME") for now. */ if (d_really_is_positive(dentry)) return 1; /* * This may be nfsd (or something), anyway, we can't see the * intent of this. So, since this can be for creation, drop it. */ if (!flags) return 0; /* * Drop the negative dentry, in order to make sure to use the * case sensitive name which is specified by user if this is * for creation. */ if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) return 0; return 1; } const struct dentry_operations jfs_ci_dentry_operations = { .d_hash = jfs_ci_hash, .d_compare = jfs_ci_compare, .d_revalidate = jfs_ci_revalidate, };
10 1 1 1 1 1 1 1 2 1 2 10 1 1 1 1 1 1 1 1 1 1 1 6 4 4 1 5 1 1 1 1 1 1 2 1 2 1 2 3 1 1 1 1 2 1 1 1 2 1 1 1 1 5 1 1 1 1 1 966 896 1 4 4 1 10 1 2 1 1 1 1 1 1 1 1 1 2 1 2 1 11 5 3 4 5 3 3 2 2 1000 5 2 12 2 3 2 1 5 2 6 79 888 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 // SPDX-License-Identifier: GPL-2.0 #include <linux/capability.h> #include <linux/compat.h> #include <linux/blkdev.h> #include <linux/export.h> #include <linux/gfp.h> #include <linux/blkpg.h> #include <linux/hdreg.h> #include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/blktrace_api.h> #include <linux/pr.h> #include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/io_uring/cmd.h> #include <uapi/linux/blkdev.h> #include "blk.h" static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) { struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; sector_t start, length, capacity, end; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) return -EFAULT; if (bdev_is_partition(bdev)) return -EINVAL; if (p.pno <= 0) return -EINVAL; if (op == BLKPG_DEL_PARTITION) return bdev_del_partition(disk, p.pno); if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start) return -EINVAL; /* Check that the partition is aligned to the block size */ if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) return -EINVAL; start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; capacity = get_capacity(disk); if (check_add_overflow(start, length, &end)) return -EINVAL; if (start >= capacity || end > capacity) return -EINVAL; switch (op) { case BLKPG_ADD_PARTITION: return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: return bdev_resize_partition(disk, p.pno, start, length); default: return -EINVAL; } } static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg) { struct blkpg_partition __user *udata; int op; if (get_user(op, &arg->op) || get_user(udata, &arg->data)) return -EFAULT; return blkpg_do_ioctl(bdev, udata, op); } #ifdef CONFIG_COMPAT struct compat_blkpg_ioctl_arg { compat_int_t op; compat_int_t flags; compat_int_t datalen; compat_caddr_t data; }; static int compat_blkpg_ioctl(struct block_device *bdev, struct compat_blkpg_ioctl_arg __user *arg) { compat_caddr_t udata; int op; if (get_user(op, &arg->op) || get_user(udata, &arg->data)) return -EFAULT; return blkpg_do_ioctl(bdev, compat_ptr(udata), op); } #endif /* * Check that [start, start + len) is a valid range from the block device's * perspective, including verifying that it can be correctly translated into * logical block addresses. */ static int blk_validate_byte_range(struct block_device *bdev, uint64_t start, uint64_t len) { unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; uint64_t end; if ((start | len) & bs_mask) return -EINVAL; if (!len) return -EINVAL; if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev)) return -EINVAL; return 0; } static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2], start, len; struct bio *prev = NULL, *bio; sector_t sector, nr_sects; struct blk_plug plug; int err; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (bdev_read_only(bdev)) return -EPERM; err = blk_validate_byte_range(bdev, start, len); if (err) return err; filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; sector = start >> SECTOR_SHIFT; nr_sects = len >> SECTOR_SHIFT; blk_start_plug(&plug); while (1) { if (fatal_signal_pending(current)) { if (prev) bio_await_chain(prev); err = -EINTR; goto out_unplug; } bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, GFP_KERNEL); if (!bio) break; prev = bio_chain_and_submit(prev, bio); } if (prev) { err = submit_bio_wait(prev); if (err == -EOPNOTSUPP) err = 0; bio_put(prev); } out_unplug: blk_finish_plug(&plug); fail: filemap_invalidate_unlock(bdev->bd_mapping); return err; } static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, void __user *argp) { uint64_t start, len, end; uint64_t range[2]; int err; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (!bdev_max_secure_erase_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(range, argp, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; if ((start & 511) || (len & 511)) return -EINVAL; if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); filemap_invalidate_unlock(bdev->bd_mapping); return err; } static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; uint64_t start, end, len; int err; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; start = range[0]; len = range[1]; end = start + len - 1; if (start & 511) return -EINVAL; if (len & 511) return -EINVAL; if (end >= (uint64_t)bdev_nr_bytes(bdev)) return -EINVAL; if (end < start) return -EINVAL; /* Invalidate the page cache, including dirty pages */ filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end); if (err) goto fail; err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP | BLKDEV_ZERO_KILLABLE); fail: filemap_invalidate_unlock(bdev->bd_mapping); return err; } static int put_ushort(unsigned short __user *argp, unsigned short val) { return put_user(val, argp); } static int put_int(int __user *argp, int val) { return put_user(val, argp); } static int put_uint(unsigned int __user *argp, unsigned int val) { return put_user(val, argp); } static int put_long(long __user *argp, long val) { return put_user(val, argp); } static int put_ulong(unsigned long __user *argp, unsigned long val) { return put_user(val, argp); } static int put_u64(u64 __user *argp, u64 val) { return put_user(val, argp); } #ifdef CONFIG_COMPAT static int compat_put_long(compat_long_t __user *argp, long val) { return put_user(val, argp); } static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) { return put_user(val, argp); } #endif #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block * drivers that implement only commands that are completely compatible * between 32-bit and 64-bit user space */ int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; if (disk->fops->ioctl) return disk->fops->ioctl(bdev, mode, cmd, (unsigned long)compat_ptr(arg)); return -ENOIOCTLCMD; } EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); #endif static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode) { /* no sense to make reservations for partitions */ if (bdev_is_partition(bdev)) return false; if (capable(CAP_SYS_ADMIN)) return true; /* * Only allow unprivileged reservations if the file descriptor is open * for writing. */ return mode & BLK_OPEN_WRITE; } static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode, struct pr_registration __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_registration reg; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_register) return -EOPNOTSUPP; if (copy_from_user(&reg, arg, sizeof(reg))) return -EFAULT; if (reg.flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); } static int blkdev_pr_reserve(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_reserve) return -EOPNOTSUPP; if (copy_from_user(&rsv, arg, sizeof(rsv))) return -EFAULT; if (rsv.flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); } static int blkdev_pr_release(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_release) return -EOPNOTSUPP; if (copy_from_user(&rsv, arg, sizeof(rsv))) return -EFAULT; if (rsv.flags) return -EOPNOTSUPP; return ops->pr_release(bdev, rsv.key, rsv.type); } static int blkdev_pr_preempt(struct block_device *bdev, blk_mode_t mode, struct pr_preempt __user *arg, bool abort) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_preempt p; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_preempt) return -EOPNOTSUPP; if (copy_from_user(&p, arg, sizeof(p))) return -EFAULT; if (p.flags) return -EOPNOTSUPP; return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); } static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode, struct pr_clear __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_clear c; if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_clear) return -EOPNOTSUPP; if (copy_from_user(&c, arg, sizeof(c))) return -EFAULT; if (c.flags) return -EOPNOTSUPP; return ops->pr_clear(bdev, c.key); } static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, unsigned long arg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync) bdev->bd_holder_ops->sync(bdev); else { mutex_unlock(&bdev->bd_holder_lock); sync_blockdev(bdev); } invalidate_bdev(bdev); return 0; } static int blkdev_roset(struct block_device *bdev, unsigned cmd, unsigned long arg) { int ret, n; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (get_user(n, (int __user *)arg)) return -EFAULT; if (bdev->bd_disk->fops->set_read_only) { ret = bdev->bd_disk->fops->set_read_only(bdev, n); if (ret) return ret; } if (n) bdev_set_flag(bdev, BD_READ_ONLY); else bdev_clear_flag(bdev, BD_READ_ONLY); return 0; } static int blkdev_getgeo(struct block_device *bdev, struct hd_geometry __user *argp) { struct gendisk *disk = bdev->bd_disk; struct hd_geometry geo; int ret; if (!argp) return -EINVAL; if (!disk->fops->getgeo) return -ENOTTY; /* * We need to set the startsect first, the driver may * want to override it. */ memset(&geo, 0, sizeof(geo)); geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) return ret; if (copy_to_user(argp, &geo, sizeof(geo))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_hd_geometry { unsigned char heads; unsigned char sectors; unsigned short cylinders; u32 start; }; static int compat_hdio_getgeo(struct block_device *bdev, struct compat_hd_geometry __user *ugeo) { struct gendisk *disk = bdev->bd_disk; struct hd_geometry geo; int ret; if (!ugeo) return -EINVAL; if (!disk->fops->getgeo) return -ENOTTY; memset(&geo, 0, sizeof(geo)); /* * We need to set the startsect first, the driver may * want to override it. */ geo.start = get_start_sect(bdev); ret = disk->fops->getgeo(bdev, &geo); if (ret) return ret; ret = copy_to_user(ugeo, &geo, 4); ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; return ret; } #endif /* set the logical block size */ static int blkdev_bszset(struct file *file, blk_mode_t mode, int __user *argp) { // this one might be file_inode(file)->i_rdev - a rare valid // use of file_inode() for those. dev_t dev = I_BDEV(file->f_mapping->host)->bd_dev; struct file *excl_file; int ret, n; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!argp) return -EINVAL; if (get_user(n, argp)) return -EFAULT; if (mode & BLK_OPEN_EXCL) return set_blocksize(file, n); excl_file = bdev_file_open_by_dev(dev, mode, &dev, NULL); if (IS_ERR(excl_file)) return -EBUSY; ret = set_blocksize(excl_file, n); fput(excl_file); return ret; } /* * Common commands that are handled the same way on native and compat * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg, void __user *argp) { unsigned int max_sectors; switch (cmd) { case BLKFLSBUF: return blkdev_flushbuf(bdev, cmd, arg); case BLKROSET: return blkdev_roset(bdev, cmd, arg); case BLKDISCARD: return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: return blk_ioctl_secure_erase(bdev, mode, argp); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); case BLKGETDISKSEQ: return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: return blkdev_report_zones_ioctl(bdev, cmd, arg); case BLKRESETZONE: case BLKOPENZONE: case BLKCLOSEZONE: case BLKFINISHZONE: return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); case BLKGETZONESZ: return put_uint(argp, bdev_zone_sectors(bdev)); case BLKGETNRZONES: return put_uint(argp, bdev_nr_zones(bdev)); case BLKROGET: return put_int(argp, bdev_read_only(bdev) != 0); case BLKSSZGET: /* get block device logical block size */ return put_int(argp, bdev_logical_block_size(bdev)); case BLKPBSZGET: /* get block device physical block size */ return put_uint(argp, bdev_physical_block_size(bdev)); case BLKIOMIN: return put_uint(argp, bdev_io_min(bdev)); case BLKIOOPT: return put_uint(argp, bdev_io_opt(bdev)); case BLKALIGNOFF: return put_int(argp, bdev_alignment_offset(bdev)); case BLKDISCARDZEROES: return put_uint(argp, 0); case BLKSECTGET: max_sectors = min_t(unsigned int, USHRT_MAX, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: return put_ushort(argp, !bdev_nonrot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; return disk_scan_partitions(bdev->bd_disk, mode | BLK_OPEN_STRICT_SCAN); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: return blkdev_pr_reserve(bdev, mode, argp); case IOC_PR_RELEASE: return blkdev_pr_release(bdev, mode, argp); case IOC_PR_PREEMPT: return blkdev_pr_preempt(bdev, mode, argp, false); case IOC_PR_PREEMPT_ABORT: return blkdev_pr_preempt(bdev, mode, argp, true); case IOC_PR_CLEAR: return blkdev_pr_clear(bdev, mode, argp); default: return -ENOIOCTLCMD; } } /* * Always keep this in sync with compat_blkdev_ioctl() * to handle all incompatible commands in both functions. * * New commands must be compatible and go into blkdev_common_ioctl */ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; blk_mode_t mode = file_to_blk_mode(file); int ret; switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: return blkdev_getgeo(bdev, argp); case BLKPG: return blkpg_ioctl(bdev, argp); /* Compat mode returns 32-bit data instead of 'long' */ case BLKRAGET: case BLKFRAGET: if (!argp) return -EINVAL; return put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; return put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ return put_int(argp, block_size(bdev)); case BLKBSZSET: return blkdev_bszset(file, mode, argp); case BLKGETSIZE64: return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP: return blk_trace_ioctl(bdev, cmd, argp); default: break; } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret != -ENOIOCTLCMD) return ret; if (!bdev->bd_disk->fops->ioctl) return -ENOTTY; return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } #ifdef CONFIG_COMPAT #define BLKBSZGET_32 _IOR(0x12, 112, int) #define BLKBSZSET_32 _IOW(0x12, 113, int) #define BLKGETSIZE64_32 _IOR(0x12, 114, int) /* Most of the generic ioctls are handled in the normal fallback path. This assumes the blkdev's low level compat_ioctl always returns ENOIOCTLCMD for unknown ioctls. */ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; blk_mode_t mode = file_to_blk_mode(file); switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: return compat_hdio_getgeo(bdev, argp); case BLKPG: return compat_blkpg_ioctl(bdev, argp); /* Compat mode returns 32-bit data instead of 'long' */ case BLKRAGET: case BLKFRAGET: if (!argp) return -EINVAL; return compat_put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: if (bdev_nr_sectors(bdev) > ~(compat_ulong_t)0) return -EFBIG; return compat_put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ return put_int(argp, bdev_logical_block_size(bdev)); case BLKBSZSET_32: return blkdev_bszset(file, mode, argp); case BLKGETSIZE64_32: return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP32: return blk_trace_ioctl(bdev, cmd, argp); default: break; } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); return ret; } #endif struct blk_iou_cmd { int res; bool nowait; }; static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); if (bic->res == -EAGAIN && bic->nowait) io_uring_cmd_issue_blocking(cmd); else io_uring_cmd_done(cmd, bic->res, 0, issue_flags); } static void bio_cmd_bio_end_io(struct bio *bio) { struct io_uring_cmd *cmd = bio->bi_private; struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); if (unlikely(bio->bi_status) && !bic->res) bic->res = blk_status_to_errno(bio->bi_status); io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete); bio_put(bio); } static int blkdev_cmd_discard(struct io_uring_cmd *cmd, struct block_device *bdev, uint64_t start, uint64_t len, bool nowait) { struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL; sector_t sector = start >> SECTOR_SHIFT; sector_t nr_sects = len >> SECTOR_SHIFT; struct bio *prev = NULL, *bio; int err; if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE)) return -EBADF; if (bdev_read_only(bdev)) return -EPERM; err = blk_validate_byte_range(bdev, start, len); if (err) return err; err = filemap_invalidate_pages(bdev->bd_mapping, start, start + len - 1, nowait); if (err) return err; while (true) { bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp); if (!bio) break; if (nowait) { /* * Don't allow multi-bio non-blocking submissions as * subsequent bios may fail but we won't get a direct * indication of that. Normally, the caller should * retry from a blocking context. */ if (unlikely(nr_sects)) { bio_put(bio); return -EAGAIN; } bio->bi_opf |= REQ_NOWAIT; } prev = bio_chain_and_submit(prev, bio); } if (unlikely(!prev)) return -EAGAIN; if (unlikely(nr_sects)) bic->res = -EAGAIN; prev->bi_private = cmd; prev->bi_end_io = bio_cmd_bio_end_io; submit_bio(prev); return -EIOCBQUEUED; } int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) { struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host); struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd); const struct io_uring_sqe *sqe = cmd->sqe; u32 cmd_op = cmd->cmd_op; uint64_t start, len; if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len || sqe->rw_flags || sqe->file_index)) return -EINVAL; bic->res = 0; bic->nowait = issue_flags & IO_URING_F_NONBLOCK; start = READ_ONCE(sqe->addr); len = READ_ONCE(sqe->addr3); switch (cmd_op) { case BLOCK_URING_CMD_DISCARD: return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait); } return -EINVAL; }
13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 // SPDX-License-Identifier: MIT #include <drm/drm_client_setup.h> #include <drm/drm_device.h> #include <drm/drm_fbdev_client.h> #include <drm/drm_fourcc.h> #include <drm/drm_print.h> /** * drm_client_setup() - Setup in-kernel DRM clients * @dev: DRM device * @format: Preferred pixel format for the device. Use NULL, unless * there is clearly a driver-preferred format. * * This function sets up the in-kernel DRM clients. Restore, hotplug * events and teardown are all taken care of. * * Drivers should call drm_client_setup() after registering the new * DRM device with drm_dev_register(). This function is safe to call * even when there are no connectors present. Setup will be retried * on the next hotplug event. * * The clients are destroyed by drm_dev_unregister(). */ void drm_client_setup(struct drm_device *dev, const struct drm_format_info *format) { int ret; ret = drm_fbdev_client_setup(dev, format); if (ret) drm_warn(dev, "Failed to set up DRM client; error %d\n", ret); } EXPORT_SYMBOL(drm_client_setup); /** * drm_client_setup_with_fourcc() - Setup in-kernel DRM clients for color mode * @dev: DRM device * @fourcc: Preferred pixel format as 4CC code for the device * * This function sets up the in-kernel DRM clients. It is equivalent * to drm_client_setup(), but expects a 4CC code as second argument. */ void drm_client_setup_with_fourcc(struct drm_device *dev, u32 fourcc) { drm_client_setup(dev, drm_format_info(fourcc)); } EXPORT_SYMBOL(drm_client_setup_with_fourcc); /** * drm_client_setup_with_color_mode() - Setup in-kernel DRM clients for color mode * @dev: DRM device * @color_mode: Preferred color mode for the device * * This function sets up the in-kernel DRM clients. It is equivalent * to drm_client_setup(), but expects a color mode as second argument. * * Do not use this function in new drivers. Prefer drm_client_setup() with a * format of NULL. */ void drm_client_setup_with_color_mode(struct drm_device *dev, unsigned int color_mode) { u32 fourcc = drm_driver_color_mode_format(dev, color_mode); drm_client_setup_with_fourcc(dev, fourcc); } EXPORT_SYMBOL(drm_client_setup_with_color_mode); MODULE_DESCRIPTION("In-kernel DRM clients"); MODULE_LICENSE("GPL and additional rights");
122 8179 11 3271 7841 389 5389 515 8168 7427 138 110 110 36 110 110 110 110 110 110 93 19 3125 110 3303 3304 491 149 118 2755 135 3116 3 1 28 4 15 28 28 28 27 229 1 8171 8168 8183 7401 35 7427 98 8241 8157 7427 7385 53 7451 7609 4672 8213 12 8166 8174 8214 3304 8298 8348 3 7380 986 1134 14 13 28 27 5379 5373 16 16 13 13 3533 8328 7399 4613 337 501 502 6 12 148 1 147 146 147 139 3 6 2 139 139 1 7 11 1 1 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * * In contrast to the low-resolution timeout API, aka timer wheel, * hrtimers provide finer resolution and accuracy depending on system * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. */ #include <linux/cpu.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/sched/isolation.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <trace/events/timer.h> #include "tick-internal.h" /* * Masks for selecting the soft and hard context timers from * cpu_base->active */ #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) /* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { /* Make sure we catch unsupported clockids */ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { .clock_base = { { .cpu_base = &migration_cpu_base, .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, &migration_cpu_base.lock), }, }, }; #define migration_base migration_cpu_base.clock_base[0] static inline bool is_migration_base(struct hrtimer_clock_base *base) { return base == &migration_base; } /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->lock) { struct hrtimer_clock_base *base; for (;;) { base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } } /* * We do not migrate the timer when it is expiring before the next * event on the target cpu. When high resolution is enabled, we cannot * reprogram the target cpu hardware and we would cause it to fire * late. To keep it simple, we handle the high resolution enabled and * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires < new_base->cpu_base->expires_next; } static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); #endif return base; } /* * We switch the timer base to a power-optimized selected CPU target, * if: * - NO_HZ_COMMON is enabled * - timer migration is enabled * - the timer callback is not running * - the timer is not the first expiring timer on the new target * * If one of the above requirements is not fulfilled we move the timer * to the current CPU or leave it on the previously assigned CPU if * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; this_cpu_base = this_cpu_ptr(&hrtimer_bases); new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq * code will take care of this when the timer function has * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_hrtimer_base() */ WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; WRITE_ONCE(timer->base, base); goto again; } WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { new_cpu_base = this_cpu_base; goto again; } } return new_base; } #else /* CONFIG_SMP */ static inline bool is_migration_base(struct hrtimer_clock_base *base) { return false; } static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } # define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ /* * Functions for the union type storage format of ktime_t which are * too large for inlining: */ #if BITS_PER_LONG < 64 /* * Divide a ktime value by a nanosecond value */ s64 __ktime_divns(const ktime_t kt, s64 div) { int sft = 0; s64 dclc; u64 tmp; dclc = ktime_to_ns(kt); tmp = dclc < 0 ? -dclc : dclc; /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } tmp >>= sft; do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* * Add two ktime values and do a safety check for overflow: */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; } EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { return ((struct hrtimer *) addr)->function; } /* * fixup_init is called when: * - an active object is initialized */ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); return true; default: return false; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); return true; default: return false; } } static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, }; static inline void debug_hrtimer_init(struct hrtimer *timer) { debug_object_init(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { debug_object_init_on_stack(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { debug_object_deactivate(timer, &hrtimer_debug_descr); } void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif static inline void debug_init(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) { debug_hrtimer_deactivate(timer); trace_hrtimer_cancel(timer); } static struct hrtimer_clock_base * __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { unsigned int idx; if (!*active) return NULL; idx = __ffs(*active); *active &= ~(1U << idx); return &cpu_base->clock_base[idx]; } #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ next = timerqueue_iterate_next(next); if (!next) continue; timer = container_of(next, struct hrtimer, node); } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; /* Skip cpu_base update if a timer is being excluded. */ if (exclude) continue; if (timer->is_soft) cpu_base->softirq_next_timer = timer; else cpu_base->next_timer = timer; } } /* * clock_was_set() might have changed base->offset of any of * the clock bases so the result might be negative. Fix it up * to prevent a false positive in clockevents_program_event(). */ if (expires_next < 0) expires_next = 0; return expires_next; } /* * Recomputes cpu_base::*next_timer and returns the earliest expires_next * but does not set cpu_base::*expires_next, that is done by * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating * cpu_base::*expires_next right away, reprogramming logic would no longer * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. * * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. * * @active_mask must be one of: * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, expires_next); } return expires_next; } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) { ktime_t expires_next, soft = KTIME_MAX; /* * If the soft interrupt has already been activated, ignore the * soft bases. They will be handled in the already raised soft * interrupt. */ if (!cpu_base->softirq_activated) { soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Update the soft expiry time. clock_settime() might have * affected it. */ cpu_base->softirq_expires_next = soft; } expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * If a softirq timer is expiring first, update cpu_base->next_timer * and program the hardware with the soft expiry time. */ if (expires_next > soft) { cpu_base->next_timer = cpu_base->softirq_next_timer; expires_next = soft; } return expires_next; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; } /* * Is the high resolution mode active ? */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; /* * If hres is not active, hardware does not have to be * reprogrammed yet. * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following * scenario: * T1 expires 50ms from now * T2 expires 5s from now * * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being * set. So we'd effectively block all timers until the T2 event * fires. */ if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); } /* * Reprogram the event source with checking both queues for the * next event * Called with interrupts disabled and base->lock held */ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer enabled ? */ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); /* * hrtimer_high_res_enabled - query, if the highres mode is enabled */ static inline int hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } static void retrigger_next_event(void *arg); /* * Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); } #else static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level * resume code. * * This is only invoked when: * - CONFIG_HIGH_RES_TIMERS is enabled. * - CONFIG_NOHZ_COMMON is enabled * * For the other cases this function is empty and because the call sites * are optimized out it vanishes as well, i.e. no need for lots of * #ifdeffery. */ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); /* * When high resolution mode or nohz is active, then the offsets of * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the * next tick will take care of that. * * If high resolution mode is active then the next expiring timer * must be reevaluated and the clock event device reprogrammed if * necessary. * * In the NOHZ case the update of the offset and the reevaluation * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. */ if (!hrtimer_hres_active(base) && !tick_nohz_active) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); raw_spin_unlock(&base->lock); } /* * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * * Called with interrupts disabled and base->cpu_base.lock held */ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ if (expires < 0) expires = 0; if (timer->is_soft) { /* * soft hrtimer could be started on a remote CPU. In this * case softirq_expires_next needs to be updated on the * remote CPU. The soft hrtimer will not expire before the * first hard hrtimer on the remote CPU - * hrtimer_check_target() prevents this case. */ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; if (timer_cpu_base->softirq_activated) return; if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) return; timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; if (expires >= cpu_base->expires_next) return; /* * If the hrtimer interrupt is running, then it will reevaluate the * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; cpu_base->next_timer = timer; __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; ktime_t expires; /* * Update the base offsets unconditionally so the following * checks whether the SMP function call is required works. * * The update is safe even when the remote CPU is in the hrtimer * interrupt or the hrtimer soft interrupt and expiring affected * bases. Either it will see the update before handling a base or * it will see it when it finishes the processing and reevaluates * the next expiring timer. */ seq = cpu_base->clock_was_set_seq; hrtimer_update_base(cpu_base); /* * If the sequence did not change over the update then the * remote CPU already handled it. */ if (seq == cpu_base->clock_was_set_seq) return false; /* * If the remote CPU is currently handling an hrtimer interrupt, it * will reevaluate the first expiring timer of all clock bases * before reprogramming. Nothing to do here. */ if (cpu_base->in_hrtirq) return false; /* * Walk the affected clock bases and check whether the first expiring * timer in a clock base is moving ahead of the first expiring timer of * @cpu_base. If so, the IPI must be invoked because per CPU clock * event devices cannot be remotely reprogrammed. */ active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; next = timerqueue_getnext(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; /* Extra check for softirq clock bases */ if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) continue; if (cpu_base->softirq_activated) continue; if (expires < cpu_base->softirq_expires_next) return true; } return false; } /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). * * This requires to update the offsets for these clocks * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this * also requires to eventually reprogram the per CPU clock event devices * when the change moves an affected timer ahead of the first expiring * timer on that CPU. Obviously remote per CPU clock event devices cannot * be reprogrammed. The other reason why an IPI has to be sent is when the * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. */ void clock_was_set(unsigned int bases) { struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { on_each_cpu(retrigger_next_event, NULL, 1); goto out_timerfd; } /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); raw_spin_lock_irqsave(&cpu_base->lock, flags); if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } preempt_disable(); smp_call_function_many(mask, retrigger_next_event, NULL, 1); preempt_enable(); cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* * Called from timekeeping code to reprogram the hrtimer interrupt device * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } /* * Called during resume either directly from via timekeeping_resume() * or in the case of s2idle from tick_unfreeze() to ensure that the * hrtimers are up to date. */ void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); } /* * Counterpart to lock_hrtimer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** * hrtimer_forward() - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * * .. note:: * This only updates the timer expiry value and does not requeue the timer. * * There is also a variant of the function hrtimer_forward_now(). * * Context: Can be safely called from the callback function of @timer. If called * from other contexts @timer must neither be enqueued nor running the * callback and the caller needs to take care of serialization. * * Return: The number of overruns are returned. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { u64 orun = 1; ktime_t delta; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) return 0; if (interval < hrtimer_resolution) interval = hrtimer_resolution; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); if (hrtimer_get_expires_tv64(timer) > now) return orun; /* * This (and the ktime_add() below) is the * correction for exact: */ orun++; } hrtimer_add_expires(timer, interval); return orun; } EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * * Returns 1 when the new timer is the leftmost timer in the tree. */ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); } /* * remove hrtimer, called with base lock held */ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { bool reprogram; /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current * CPU. If we remove a timer on another CPU, reprogramming is * skipped. The interrupt event on this CPU is fired and * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * If the timer is not restarted then reprogramming is * required if the timer is local. If it is local and about * to be restarted, avoid programming it twice (on removal * and a moment later when it's requeued). */ if (!restart) state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { #ifdef CONFIG_TIME_LOW_RES /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { ktime_t expires; /* * Find the next SOFT expiration. */ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * reprogramming needs to be triggered, even if the next soft * hrtimer expires at the same time than the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() * cpu_base->*expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; bool force_local, first; /* * If the timer is on the local cpu base and is the first expiring * timer then this might end up reprogramming the hardware twice * (on removal and on enqueue). To avoid that by prevent the * reprogram on removal, keep the timer local to the current CPU * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); force_local &= base->cpu_base->next_timer == timer; /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. * * If it's on the current CPU and the first expiring timer, then * skip reprogramming, keep the timer local and enforce * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). */ remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ if (!force_local) { new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); } else { new_base = base; } first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) return first; /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the * hardware by evaluating the new first expiring timer. */ hrtimer_force_reprogram(new_base->cpu_base, 1); return 0; } /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; if (WARN_ON_ONCE(!timer->function)) return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard * expiry mode because unmarked timers are moved to softirq expiry. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); else WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; /* * Check lockless first. If the timer is not active (neither * enqueued nor running the callback, nothing to do here. The * base lock does not serialize against a concurrent enqueue, * so we can avoid taking it. */ if (!hrtimer_active(timer)) return 0; base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); return ret; } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); #ifdef CONFIG_PREEMPT_RT static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { spin_lock_init(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) __acquires(&base->softirq_expiry_lock) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) __releases(&base->softirq_expiry_lock) { spin_unlock(&base->softirq_expiry_lock); } /* * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); spin_unlock(&cpu_base->softirq_expiry_lock); spin_lock(&cpu_base->softirq_expiry_lock); raw_spin_lock_irq(&cpu_base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted * in the middle of a timer callback, then calling del_timer_sync() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer * handler to complete. This can result in unbound priority inversion. * * - If the caller originates from the task which preempted the timer * handler on the same CPU, then spin waiting for the timer handler to * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); /* * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } /* * Mark the base as contended and grab the expiry lock, which is * held by the softirq across the timer callback. Drop the lock * immediately so the softirq can expire the next timer. In theory * the timer could already be running again, but that's more than * unlikely and just causes another wait loop. */ atomic_inc(&base->cpu_base->timer_waiters); spin_lock_bh(&base->cpu_base->softirq_expiry_lock); atomic_dec(&base->cpu_base->timer_waiters); spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long flags) { } #endif /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled * * Returns: * 0 when the timer was not active * 1 when the timer was active */ int hrtimer_cancel(struct hrtimer *timer) { int ret; do { ret = hrtimer_try_to_cancel(timer); if (ret < 0) hrtimer_cancel_wait_running(timer); } while (ret < 0); return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) rem = hrtimer_expires_remaining_adjusted(timer); else rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } /** * hrtimer_next_event_without - time until next expiry event w/o one timer * @exclude: timer to exclude * * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; expires = __hrtimer_next_event_base(cpu_base, exclude, active, KTIME_MAX); } active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; expires = __hrtimer_next_event_base(cpu_base, exclude, active, expires); } raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { if (likely(clock_id < MAX_CLOCKS)) { int base = hrtimer_clock_to_base_table[clock_id]; if (likely(base != HRTIMER_MAX_CLOCK_BASES)) return base; } WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); return HRTIMER_BASE_MONOTONIC; } static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) { return HRTIMER_NORESTART; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); /* * POSIX magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they needs to become CLOCK_MONOTONIC to * ensure POSIX compliance. */ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { __hrtimer_init(timer, clock_id, mode); if (WARN_ON_ONCE(!function)) timer->function = hrtimer_dummy_timeout; else timer->function = function; } /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_setup - initialize a timer to the given clock * @timer: the timer to be initialized * @function: the callback function * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); /** * hrtimer_setup_on_stack - initialize a timer on stack memory * @timer: The timer to be initialized * @function: the callback function * @clock_id: The clock to be used * @mode: The timer mode * * Similar to hrtimer_setup(), except that this one must be used if struct hrtimer is in stack * memory. */ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { debug_init_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated * to another cpu. * * It is important for this function to not return a false negative. */ bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned int seq; do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } EXPORT_SYMBOL_GPL(hrtimer_active); /* * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 * distinct sections: * * - queued: the timer is queued * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * * On the read side we ensure we observe timer->state and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. * * The sequence numbers are required because otherwise we could still observe * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the * timer is restarted with a period then it becomes an absolute * timer. If its not restarted it does not matter. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES)) timer->is_rel = false; /* * The timer is marked as running in the CPU base, so it is * protected against migration to a different CPU even if the lock * is dropped. */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); WARN_ON_ONCE(base->running != timer); base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ if (basenow < hrtimer_get_softexpires_tv64(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } } } static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); cpu_base->softirq_activated = 0; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via * the migration code. This does not affect enqueueing of * timers which run their callback and need to be requeued on * this CPU. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the [soft] next expiry */ expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. * * Acquire base lock for updating the offsets and retrieving * the current time. */ raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; /* * Give the system a chance to do something else than looping * here. We stored the entry time, so we know exactly how long * we spent here. We schedule the next event this amount of * time away. */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. */ if (delta > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; if (hrtimer_hres_active(cpu_base)) return; /* * This _is_ ugly: We have to check periodically, whether we * can switch to highres and / or nohz mode. The clocksource * switch happens with xtime_lock held. Notification from * there only sets the check bit in the tick_oneshot code, * otherwise we might deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); return; } raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* * Sleep related functions: */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; if (task) wake_up_process(task); return HRTIMER_NORESTART; } /** * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer * @sl: sleeper to be started * @mode: timer mode abs/rel * * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because * __hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other * functions which are not suitable for hard interrupt context on * PREEMPT_RT. * * The hrtimer_sleeper callback is RT compatible in hard interrupt * context, but there is a latency concern: Untrusted userspace can * spawn many threads which arm timers for the same expiry time on * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. * * OTOH, privileged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } /** * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_init_on_stack(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: BUG(); } return -ERESTART_RESTARTBLOCK; } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; do { set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); __set_current_state(TASK_RUNNING); if (!t->task) return 0; restart = &current->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); struct timespec64 rmt; if (rem <= 0) return 0; rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; hrtimer_setup_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_MODE_ABS) { ret = -ERESTARTNOHAND; goto out; } restart = &current->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); return ret; } #ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; if (get_timespec64(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif /* * Functions related to boot-time initialization: */ int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct timerqueue_node *node; while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not * reprogram the event device in case the timer * expires before the earliest on this CPU, but we run * hrtimer_interrupt after we migrated everything to * sort out already expired timers and reprogram the * event device. */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; old_base = this_cpu_ptr(&hrtimer_bases); new_base = &per_cpu(hrtimer_bases, ncpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } /* * The migration might have changed the first expiring softirq * timer on this CPU. Update it. */ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; } #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); }
7337 7337 7104 243 51 8 7104 63 4 7138 73 4324 15 2 5 10 2315 2311 7 1 2319 4370 227 68 7318 64 9 5 8 3 47 8 2670 4125 100 126 27 8121 8136 371 489 311 95 316 307 7963 994 3362 8097 4225 8131 8148 30 4188 8142 209 8122 2080 4146 4281 4543 7338 7348 7370 5629 23 7323 7318 26 26 26 184 12 7324 80 7321 7324 7282 6821 788 150 2 20 21 7257 11 7247 1 6870 1 7208 19 7207 7193 12 184 1 9 175 5058 6517 1619 18 5454 11 6539 9 5741 2015 3 1 2870 2758 2764 2759 160 2745 2 2744 2 22 2743 137 150 1 1 149 17 369 447 258 75 1 73 724 12 1 1 1 1 57 49 52 91 1 97 15 93 8 8 8 1 7 2 1 2 184 161 176 6 2 22 178 22 180 22 2 187 185 14 19 2 18 14 1 13 4347 1 3192 7 3 3197 2177 4742 1 7 1 164 170 821 1 8 2407 510 110 31 1476 1475 3 1223 89 6086 2488 4713 247 2 5 169 362 328 372 23 2588 6 2 24 5 2568 2487 135 1476 7246 4 1 12 31 510 1 26 510 510 7 31 509 8 246 322 3 215 196 249 267 148 344 3 148 475 161 5604 5626 821 5335 364 2 4592 1544 164 363 467 243 131 52 93 120 120 3 15 80 67 1 7 71 4 4 5833 5806 85 138 131 80 186 184 1 3 4758 582 16 5410 8 4623 1469 4055 358 16526 17229 3 71 71 7293 7349 1139 2647 126 7250 120 7293 192 5755 6025 1212 7085 218 7175 324 6364 217 350 6327 7 5 7346 12 7297 128 7330 7339 23 12 11 2631 6189 6195 6189 58 58 320 4 322 281 42 6413 1 2 23 3 4468 81 85 79 9 4556 4551 8 715 4495 4222 78 4209 960 4613 129 4521 4499 12 4507 4 637 4242 4520 2 86 2 1489 1563 41 1535 1529 87 2 88 88 1487 1535 2259 2263 8 8 1263 1268 1259 1245 962 960 6 958 177 178 4 174 354 11 343 63 35 222 189 188 15 14 1 14 3621 3619 11 1 692 4 695 4 4 687 5 279 446 685 1044 1040 1040 230 141 59 86 56 116 133 301 118 133 297 230 1471 170 1 1 167 31 146 19 5 14 1647 10 1643 3462 2119 15 1037 2086 33 1295 49 3425 15 3428 1 1 118 16 24 729 17 8 2 7 7 51 1458 1468 19 5 33 650 826 1085 419 650 825 81 759 17 16 1 21 1442 1345 123 3 809 591 3 584 1279 2 2 942 2751 35 3400 150 15 7 128 5 3563 7 3497 775 656 776 74 341 1171 835 656 1481 278 797 517 1268 2 835 655 425 1097 280 796 516 3001 46 2913 325 517 4 3406 3018 2757 768 17 737 3385 516 2874 131 89 3315 3072 120 3343 3249 124 126 1 4 121 11 112 13 2 108 101 95 5 32 6 1 13 15 36 27 13 3748 31 35 799 3653 3417 1786 3182 3733 3738 3687 30 3701 1 3708 16 16 15 15 15 15 14 922 59 1 70 52 778 50 90 40 40 41 11 11 115 2 112 7 105 105 34 16 94 185 3 187 188 37 155 2 153 29 65 153 186 73 115 472 4 1 469 467 111 389 489 124 401 1 400 399 486 340 153 116 5 1 111 111 38 71 40 71 131 11 1 1 4 19 1 110 125 131 110 240 5 1 236 235 9 19 221 220 182 14 1 1 4 28 1 160 37 162 177 178 183 3 2 77 1 22 53 132 137 2 1 36 106 151 4 19 1 134 133 150 150 66 84 108 1 1 1 1 105 85 31 97 3 34 5 1 1 5 70 75 74 98 98 31 67 439 9 4 270 140 54 10 1 316 63 61 105 18 420 101 60 95 69 5 9 259 417 421 406 14 14 153 271 329 107 121 239 65 25 90 1 274 42 418 279 173 124 275 42 436 3 3 34 11 1 1 1 2 294 114 5 56 5 385 313 5 34 39 159 1 229 1 1 21 359 401 397 405 429 431 187 12 241 5 16 22 3 8 14 3 9 3 11 11 8 8 39 7 7 1 37 31 31 31 16 1 5 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/namei.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Some corrections by tytso. */ /* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname * lookup logic. */ /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture. */ #include <linux/init.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/wordpart.h> #include <linux/fs.h> #include <linux/filelock.h> #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/capability.h> #include <linux/file.h> #include <linux/fcntl.h> #include <linux/device_cgroup.h> #include <linux/fs_struct.h> #include <linux/posix_acl.h> #include <linux/hash.h> #include <linux/bitops.h> #include <linux/init_task.h> #include <linux/uaccess.h> #include "internal.h" #include "mount.h" /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs * to know the _real_ pathname, not the user-supplied one, in case * of symlinks (and also when transname replacements occur). * * The new code replaces the old recursive symlink resolution with * an iterative one (in case of non-nested symlink chains). It does * this with calls to <fs>_follow_link(). * As a side effect, dir_namei(), _namei() and follow_link() are now * replaced with a single function lookup_dentry() that can handle all * the special cases of the former code. * * With the new dcache, the pathname is stored at each inode, at least as * long as the refcount of the inode is positive. As a side effect, the * size of the dcache depends on the inode cache and thus is dynamic. * * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink * resolution to correspond with current state of the code. * * Note that the symlink resolution is not *completely* iterative. * There is still a significant amount of tail- and mid- recursion in * the algorithm. Also, note that <fs>_readlink() is not used in * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() * may return different results than <fs>_follow_link(). Many virtual * filesystems (including /proc) exhibit this behavior. */ /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL * and the name already exists in form of a symlink, try to create the new * name indicated by the symlink. The old code always complained that the * name already exists, due to not following the symlink even if its target * is nonexistent. The new semantics affects also mknod() and link() when * the name is a symlink pointing to a non-existent name. * * I don't know which semantics is the right one, since I have no access * to standards. But I found by trial that HP-UX 9.0 has the full "new" * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the * "old" one. Personally, I think the new semantics is much more logical. * Note that "ln old new" where "new" is a symlink pointing to a non-existing * file does succeed in both HP-UX and SunOs, but not in Solaris * and in the old Linux semantics. */ /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink * semantics. See the comments in "open_namei" and "do_link" below. * * [10-Sep-98 Alan Modra] Another symlink change. */ /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: * inside the path - always follow. * in the last component in creation/removal/renaming - never follow. * if LOOKUP_FOLLOW passed - follow. * if the pathname has trailing slashes - follow. * otherwise - don't follow. * (applied in that order). * * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT * restored for 2.4. This is the last surviving part of old 4.2BSD bug. * During the 2.4 we need to fix the userland stuff depending on it - * hopefully we will be able to get rid of that wart in 2.5. So far only * XEmacs seems to be relying on it... */ /* * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives * any extra contention... */ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. * * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) struct filename * getname_flags(const char __user *filename, int flags) { struct filename *result; char *kname; int len; result = audit_reusename(filename); if (result) return result; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); /* * First, try to embed the struct filename inside the names_cache * allocation */ kname = (char *)result->iname; result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); /* * Handle both empty path and copy failure in one go. */ if (unlikely(len <= 0)) { if (unlikely(len < 0)) { __putname(result); return ERR_PTR(len); } /* The empty path is special. */ if (!(flags & LOOKUP_EMPTY)) { __putname(result); return ERR_PTR(-ENOENT); } } /* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland. */ if (unlikely(len == EMBEDDED_NAME_MAX)) { const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; /* * size is chosen that way we to guarantee that * result->iname[0] is within the same object and that * kname can't be equal to result->iname, no matter what. */ result = kzalloc(size, GFP_KERNEL); if (unlikely(!result)) { __putname(kname); return ERR_PTR(-ENOMEM); } result->name = kname; len = strncpy_from_user(kname, filename, PATH_MAX); if (unlikely(len < 0)) { __putname(kname); kfree(result); return ERR_PTR(len); } /* The empty path is special. */ if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { __putname(kname); kfree(result); return ERR_PTR(-ENOENT); } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); return ERR_PTR(-ENAMETOOLONG); } } atomic_set(&result->refcnt, 1); result->uptr = filename; result->aname = NULL; audit_getname(result); return result; } struct filename *getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return getname_flags(filename, flags); } struct filename *getname(const char __user * filename) { return getname_flags(filename, 0); } struct filename *__getname_maybe_null(const char __user *pathname) { struct filename *name; char c; /* try to save on allocations; loss on um, though */ if (get_user(c, pathname)) return ERR_PTR(-EFAULT); if (!c) return NULL; name = getname_flags(pathname, LOOKUP_EMPTY); if (!IS_ERR(name) && !(name->name[0])) { putname(name); name = NULL; } return name; } struct filename *getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); } tmp->name = (char *)result; result = tmp; } else { __putname(result); return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; atomic_set(&result->refcnt, 1); audit_getname(result); return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { if (IS_ERR_OR_NULL(name)) return; if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) return; if (!atomic_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { __putname(name->name); kfree(name); } else __putname(name); } EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the ACL permission checking. Since this function * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; /* no ->get_inode_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } #endif return -EAGAIN; } /* * Very quick optimistic "we know we have no ACL's" check. * * Note that this is purely for ACL_TYPE_ACCESS, and purely * for the "we have cached that there are no ACLs" case. * * If this returns true, we know there are no ACLs. But if * it returns false, we might still not have ACLs (it could * be the is_uncached_acl() case). */ static inline bool no_acl_inode(struct inode *inode) { #ifdef CONFIG_FS_POSIX_ACL return likely(!READ_ONCE(inode->i_acl)); #else return true; #endif } /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the basic UNIX permission checking. Since this * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* * Common cheap case: everybody has the requested * rights, and there are no ACLs to check. No need * to do any owner/group checks in that case. * * - 'mask&7' is the requested permission bit set * - multiplying by 0111 spreads them out to all of ugo * - '& ~mode' looks for missing inode permission bits * - the '!' is for "no missing permissions" * * After that, we just need to check that there are no * ACL's on the inode - do the 'IS_POSIXACL()' check last * because it will dereference the ->i_sb pointer and we * want to avoid that if at all possible. */ if (!((mask & 7) * 0111 & ~mode)) { if (no_acl_inode(inode)) return 0; if (!IS_POSIXACL(inode)) return 0; } /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; } /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } /* Only RWX matters for group/other mode bits */ mask &= 7; /* * Are the group permissions different from * the other permissions in the bits we care * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } /* Bits in 'mode' clear that we require? */ return (mask & ~mode) ? -EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. * We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. * * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; /* * Do the basic permission checks. */ ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. * Executable DACs are overridable when there is * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } return generic_permission(idmap, inode, mask); } /** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; } /** * inode_permission - Check for access rights to a given inode * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without * changing the "normal" UIDs which are used for other things. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (retval) return retval; if (unlikely(mask & MAY_WRITE)) { /* * Nobody gets write access to an immutable file. */ if (IS_IMMUTABLE(inode)) return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be * written back improperly if their true value is unknown * to the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EACCES; } retval = do_inode_permission(idmap, inode, mask); if (retval) return retval; retval = devcgroup_inode_permission(inode, mask); if (retval) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount. */ void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); } EXPORT_SYMBOL(path_get); /** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount. */ void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); } EXPORT_SYMBOL(path_put); #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; const char *pathname; struct nameidata *saved; unsigned root_seq; int dfd; vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; #define ND_ROOT_PRESET 1 #define ND_ROOT_GRABBED 2 #define ND_JUMPED 4 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; p->depth = 0; p->dfd = dfd; p->name = name; p->pathname = likely(name) ? name->name : ""; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; } static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, const struct path *root) { __set_nameidata(p, dfd, name); p->state = 0; if (unlikely(root)) { p->state = ND_ROOT_PRESET; p->root = *root; } } static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; current->nameidata = old; if (old) old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack); } static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!p)) return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; return true; } /** * path_connected - Verify that a dentry is below mnt.mnt_root * @mnt: The mountpoint to check. * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. */ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; /* Bind mounts can have disconnected paths */ if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) { int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; do_delayed_call(&last->done); clear_delayed_call(&last->done); } } static void leave_rcu(struct nameidata *nd) { nd->flags &= ~LOOKUP_RCU; nd->seq = nd->next_seq = 0; rcu_read_unlock(); } static void terminate_walk(struct nameidata *nd) { drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); nd->state &= ~ND_ROOT_GRABBED; } } else { leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; path->dentry = NULL; return false; } if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { path->dentry = NULL; return false; } return !read_seqcount_retry(&path->dentry->d_seq, seq); } static inline bool legitimize_path(struct nameidata *nd, struct path *path, unsigned seq) { return __legitimize_path(path, seq, nd->m_seq); } static bool legitimize_links(struct nameidata *nd) { int i; if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); nd->depth = 0; return false; } for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { drop_links(nd); nd->depth = i + 1; return false; } } return true; } static bool legitimize_root(struct nameidata *nd) { /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ /** * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: true on success, false on failure * * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). */ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: leave_rcu(nd); return false; } /** * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) goto out2; goto out1; } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ if (unlikely(!legitimize_root(nd))) goto out_dput; leave_rcu(nd); return true; out2: nd->path.mnt = NULL; out1: nd->path.dentry = NULL; out: leave_rcu(nd); return false; out_dput: leave_rcu(nd); dput(dentry); return false; } static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dentry, flags); else return 1; } /** * complete_walk - successful completion of path walk * @nd: pointer nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure caller does not * need to drop nd->path. */ static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; if (nd->flags & LOOKUP_RCU) { /* * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ if (!(nd->state & ND_ROOT_PRESET)) if (!(nd->flags & LOOKUP_IS_SCOPED)) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; } if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't * ever step outside the root during lookup" and should already * be guaranteed by the rest of namei, we want to avoid a namei * BUG resulting in userspace being given a path that was not * scoped within the root at some point during the lookup. * * So, do a final sanity-check to make sure that in the * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) * we won't silently return an fd completely outside of the * requested root to userspace. * * Userspace could move the path outside the root after this * check, but as discussed elsewhere this is not a concern (the * resolved file was inside the root at some point). */ if (!path_is_under(&nd->path, &nd->root)) return -EXDEV; } if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; if (!status) status = -ESTALE; return status; } static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; /* * Jumping to the real root in a scoped-lookup is a BUG in namei, but we * still have to ensure it doesn't happen because it will cause a breakout * from the dirfd. */ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) return -ENOTRECOVERABLE; if (nd->flags & LOOKUP_RCU) { unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; } return 0; } static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { /* Absolute path arguments to path_init() are allowed. */ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) return -EXDEV; } if (!nd->root.mnt) { int error = set_root(nd); if (error) return error; } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); nd->path = nd->root; path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } nd->state |= ND_JUMPED; return 0; } /* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) goto err; error = -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) goto err; } /* Not currently safe for scoped-lookups. */ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) goto err; path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->state |= ND_JUMPED; return 0; err: path_put(path); return error; } static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } static int sysctl_protected_symlinks __read_mostly; static int sysctl_protected_hardlinks __read_mostly; static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL static struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_namei_sysctls(void) { register_sysctl_init("fs", namei_sysctls); return 0; } fs_initcall(init_fs_namei_sysctls); #endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error. */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; idmap = mnt_idmap(nd->path.mnt); vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true. */ static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; /* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) return false; /* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) return false; /* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; } /** * may_linkat - Check permissions for creating a hardlink * @idmap: idmap of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ if (safe_hardlink_source(idmap, inode) || inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } /** * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled * - the file already exists * - we are in a sticky directory * - we don't own the file * - the owner of the directory doesn't own the file * - the directory is world writable * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 * the directory doesn't have to be world writable: being group writable will * be enough. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. */ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; if (likely(!(dir_mode & S_ISVTX))) return 0; if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) return 0; if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) return 0; i_vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq(i_vfsuid, dir_vfsuid)) return 0; if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) return 0; if (likely(dir_mode & 0002)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); return -EACCES; } if (dir_mode & 0020) { if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_fifo"); return -EACCES; } if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_regular"); return -EACCES; } } return 0; } /* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root. */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint; read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } EXPORT_SYMBOL(follow_up); static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, struct path *path, unsigned *seqp) { while (mnt_has_parent(m)) { struct dentry *mountpoint = m->mnt_mountpoint; m = m->mnt_parent; if (unlikely(root->dentry == mountpoint && root->mnt == &m->mnt)) break; if (mountpoint != m->mnt.mnt_root) { path->mnt = &m->mnt; path->dentry = mountpoint; *seqp = read_seqcount_begin(&mountpoint->d_seq); return true; } } return false; } static bool choose_mountpoint(struct mount *m, const struct path *root, struct path *path) { bool found; rcu_read_lock(); while (1) { unsigned seq, mseq = read_seqbegin(&mount_lock); found = choose_mountpoint_rcu(m, root, path, &seq); if (unlikely(!found)) { if (!read_seqretry(&mount_lock, mseq)) break; } else { if (likely(__legitimize_path(path, seq, mseq))) break; rcu_read_unlock(); path_put(path); rcu_read_lock(); } } rcu_read_unlock(); return found; } /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. * * We do, however, want to mount if someone wants to open or * create a file of any type under the mountpoint, wants to * traverse through the mountpoint or wants to open the * mounted directory. Also, autofs may mark negative dentries * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && dentry->d_inode) return -EISDIR; if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; return finish_automount(dentry->d_op->d_automount(path), path); } /* * mount traversal - out-of-line part. One note on ->d_flags accesses - * dentries are pinned but not locked here, so negative dentry can go * positive right under us. Use of smp_load_acquire() provides a barrier * sufficient for ->d_inode and ->d_flags consistency. */ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int *count, unsigned lookup_flags) { struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_mutex * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; continue; } } if (!(flags & DCACHE_NEED_AUTOMOUNT)) break; // uncovered automount point ret = follow_automount(path, count, lookup_flags); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (ret == -EISDIR) ret = 0; // possible if you race with several mount --move if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; *jumped = need_mntput; return ret; } static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { *jumped = false; if (unlikely(d_flags_negative(flags))) return -ENOENT; return 0; } return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) { struct vfsmount *mounted; mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); return 1; } return 0; } EXPORT_SYMBOL(follow_down_one); /* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); return ret; } EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; if (likely(!(flags & DCACHE_MANAGED_DENTRY))) return true; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; for (;;) { /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { int res = dentry->d_op->d_manage(path, true); if (res) return res == -EISDIR; flags = dentry->d_flags; } if (flags & DCACHE_MOUNTED) { struct mount *mounted = __lookup_mnt(path->mnt, dentry); if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; // makes sure that non-RCU pathwalk could reach // this state. if (read_seqretry(&mount_lock, nd->m_seq)) return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) return false; } return !(flags & DCACHE_NEED_AUTOMOUNT); } } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, struct path *path) { bool jumped; int ret; path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; if (!try_to_unlazy_next(nd, dentry)) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); } return ret; } /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); dput(dentry); return ERR_PTR(error); } } return dentry; } /* * Parent directory has inode locked exclusive. This is one * and only case when ->lookup() gets called on non in-lookup * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. */ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry = lookup_dcache(name, base, flags); struct dentry *old; struct inode *dir = base->d_inode; if (dentry) return dentry; /* Don't create child dentry for a dead directory. */ if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } return dentry; } EXPORT_SYMBOL(lookup_one_qstr_excl); /** * lookup_fast - do fast lockless (but racy) lookup of a dentry * @nd: current nameidata * * Do a fast, but racy lookup in the dcache for the given dentry, and * revalidate it. Returns a valid dentry pointer or NULL if one wasn't * found. On error, an ERR_PTR will be returned. * * If this function returns a valid dentry and the walk is no longer * lazy, the dentry will carry a reference that must later be put. If * RCU mode is still in force, then this is not the case and the dentry * must be legitimized before use. If this returns NULL, then the walk * will no longer be in RCU mode. */ static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, the caller is * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. */ if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); status = d_revalidate(dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; status = d_revalidate(dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); return ERR_PTR(status); } return dentry; } /* Fast lookup failed, do it the slow way */ static struct dentry *__lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: dentry = d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { int error = d_revalidate(dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); dput(dentry); goto again; } dput(dentry); dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); d_lookup_done(dentry); if (unlikely(old)) { dput(dentry); dentry = old; } } return dentry; } static struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode *inode = dir->d_inode; struct dentry *res; inode_lock_shared(inode); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); if (likely(!err)) return 0; // If we failed, and we weren't in LOOKUP_RCU, it's final if (!(nd->flags & LOOKUP_RCU)) return err; // Drop out of RCU mode to make sure it wasn't transient if (!try_to_unlazy(nd)) return -ECHILD; // redo it all non-lazy if (err != -ECHILD) // hard error return err; return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; if (likely(nd_alloc_stack(nd))) return 0; if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) return 0; } return -ENOMEM; } enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); return ERR_PTR(error); } last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); if (unlikely(error)) return ERR_PTR(error); } if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (!(nd->flags & LOOKUP_RCU)) { touch_atime(&last->link); cond_resched(); } else if (atime_needs_update(&last->link, inode)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); touch_atime(&last->link); } error = security_inode_follow_link(link->dentry, inode, nd->flags & LOOKUP_RCU); if (unlikely(error)) return ERR_PTR(error); res = READ_ONCE(inode->i_link); if (!res) { const char * (*get)(struct dentry *, struct inode *, struct delayed_call *); get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); } else { res = get(link->dentry, inode, &last->done); } if (!res) goto all_done; if (IS_ERR(res)) return res; } if (*res == '/') { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); while (unlikely(*++res == '/')) ; } if (*res) return res; all_done: // pure jump put_link(nd); return NULL; } /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. * * NOTE: dentry must be what nd->next_seq had been sampled from. */ static const char *step_into(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); if (unlikely(!inode)) return ERR_PTR(-ENOENT); } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } return pick_link(nd, &path, inode, flags); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; unsigned seq; if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), &nd->root, &path, &seq)) goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); nd->next_seq = nd->seq; return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; if (!choose_mountpoint(real_mount(nd->path.mnt), &nd->root, &path)) goto in_root; path_put(&nd->path); nd->path = path; nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return ERR_PTR(-ENOENT); } return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); if (error) return error; } if (nd->flags & LOOKUP_RCU) parent = follow_dotdot_rcu(nd); else parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * If there was a racing rename or mount along our * path, then we can't be sure that ".." hasn't jumped * above nd->root (and so userspace should retry or use * some fallback). */ smp_rmb(); if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq)) return ERR_PTR(-EAGAIN); if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq)) return ERR_PTR(-EAGAIN); } } return NULL; } static const char *walk_component(struct nameidata *nd, int flags) { struct dentry *dentry; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ if (unlikely(nd->last_type != LAST_NORM)) { if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } dentry = lookup_fast(nd); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (unlikely(!dentry)) { dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags); if (IS_ERR(dentry)) return ERR_CAST(dentry); } if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return step_into(nd, flags, dentry); } /* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * * - Architectures with fast unaligned word accesses. We could * do a "get_unaligned()" if this helps and is sufficiently * fast. * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. * * - Furthermore, we need an efficient 64-bit compile for the * 64-bit case in order to generate the "number of bytes in * the final mask". Again, that could be replaced with a * efficient population count instruction or similar. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> #ifdef HASH_MIX /* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */ #elif defined(CONFIG_64BIT) /* * Register pressure in the mixing function is an issue, particularly * on 32-bit x86, but almost any function requires one state value and * one temporary. Instead, use a function designed for two state values * and no temporaries. * * This function cannot create a collision in only two iterations, so * we have two iterations to achieve avalanche. In those two iterations, * we have six layers of mixing, which is enough to spread one bit's * influence out to 2^6 = 64 state bits. * * Rotate constants are scored by considering either 64 one-bit input * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the * probability of that delta causing a change to each of the 128 output * bits, using a sample of random initial states. * * The Shannon entropy of the computed probabilities is then summed * to produce a score. Ideally, any input change has a 50% chance of * toggling any given output bit. * * Mixing scores (in bits) for (12,45): * Input delta: 1-bit 2-bit * 1 round: 713.3 42542.6 * 2 rounds: 2753.7 140389.8 * 3 rounds: 5954.1 233458.2 * 4 rounds: 7862.6 256672.2 * Perfect: 8192 258048 * (64*128) (64*63/2 * 128) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol64(x,12),\ x += y, y = rol64(y,45),\ y *= 9 ) /* * Fold two longs into one 32-bit hash value. This must be fast, but * latency isn't quite as critical, as there is a fair bit of additional * work done before the hash value is used. */ static inline unsigned int fold_hash(unsigned long x, unsigned long y) { y ^= x * GOLDEN_RATIO_64; y *= GOLDEN_RATIO_64; return y >> 32; } #else /* 32-bit case */ /* * Mixing scores (in bits) for (7,20): * Input delta: 1-bit 2-bit * 1 round: 330.3 9201.6 * 2 rounds: 1246.4 25475.4 * 3 rounds: 1907.1 31295.1 * 4 rounds: 2042.3 31718.6 * Perfect: 2048 31744 * (32*64) (32*31/2 * 64) */ #define HASH_MIX(x, y, a) \ ( x ^= (a), \ y ^= x, x = rol32(x, 7),\ x += y, y = rol32(y,20),\ y *= 9 ) static inline unsigned int fold_hash(unsigned long x, unsigned long y) { /* Use arch-optimized multiply if one exists */ return __hash_32(y ^ __hash_32(x)); } #endif /* * Return the hash of a string of known length. This is carfully * designed to match hash_name(), which is the more critical function. * In particular, we must end by hashing a final word containing 0..7 * payload bytes, to match the way that hash_name() iterates until it * finds the delimiter after the name. */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long a, x = 0, y = (unsigned long)salt; for (;;) { if (!len) goto done; a = load_unaligned_zeropad(name); if (len < sizeof(unsigned long)) break; HASH_MIX(x, y, a); name += sizeof(unsigned long); len -= sizeof(unsigned long); } x ^= a & bytemask_from_count(len); done: return fold_hash(x, y); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long a = 0, x = 0, y = (unsigned long)salt; unsigned long adata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; len = 0; goto inside; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); inside: a = load_unaligned_zeropad(name+len); } while (!has_zero(a, &adata, &constants)); adata = prep_zero_mask(a, adata, &constants); mask = create_zero_mask(adata); x ^= a & zero_bytemask(mask); return hashlen_create(fold_hash(x, y), len + find_zero(mask)); } EXPORT_SYMBOL(hashlen_string); /* * Calculate the length and hash of the path component, and * return the length as the result. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long a, b, x, y = (unsigned long)nd->path.dentry; unsigned long adata, bdata, mask, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; /* * The first iteration is special, because it can result in * '.' and '..' and has no mixing other than the final fold. */ a = load_unaligned_zeropad(name); b = a ^ REPEAT_BYTE('/'); if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); *lastword = a; len = find_zero(mask); nd->last.hash = fold_hash(a, y); nd->last.len = len; return name + len; } len = 0; x = 0; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); x ^= a; len += find_zero(mask); *lastword = 0; // Multi-word components cannot be DOT or DOTDOT nd->last.hash = fold_hash(x, y); nd->last.len = len; return name + len; } /* * Note that the 'last' word is always zero-masked, but * was loaded as a possibly big-endian word. */ #ifdef __BIG_ENDIAN #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) #endif #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; while (c) { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } return hashlen_create(end_name_hash(hash), len); } EXPORT_SYMBOL(hashlen_string); /* * We know there's a real path component here of at least * one character. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long hash = init_name_hash(nd->path.dentry); unsigned long len = 0, c, last = 0; c = (unsigned char)*name; do { last = (last << 8) + c; len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); // This is reliable for DOT or DOTDOT, since the component // cannot contain NUL characters - top bits being zero means // we cannot have had any other pathnames. *lastword = last; nd->last.hash = end_name_hash(hash); nd->last.len = len; return name + len; } #endif #ifndef LAST_WORD_IS_DOT #define LAST_WORD_IS_DOT 0x2e #define LAST_WORD_IS_DOTDOT 0x2e2e #endif /* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. * * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ static int link_path_walk(const char *name, struct nameidata *nd) { int depth = 0; // depth <= nd->depth int err; nd->last_type = LAST_ROOT; nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); while (*name=='/') name++; if (!*name) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } /* At this point we know we have a real path component. */ for(;;) { struct mnt_idmap *idmap; const char *link; unsigned long lastword; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); if (err) return err; nd->last.name = name; name = hash_name(nd, name, &lastword); switch(lastword) { case LAST_WORD_IS_DOTDOT: nd->last_type = LAST_DOTDOT; nd->state |= ND_JUMPED; break; case LAST_WORD_IS_DOT: nd->last_type = LAST_DOT; break; default: nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED; struct dentry *parent = nd->path.dentry; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { err = parent->d_op->d_hash(parent, &nd->last); if (err < 0) return err; } } if (!*name) goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { name++; } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ if (!depth) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; } /* last component of nested symlink */ name = nd->stack[--depth].name; link = walk_component(nd, 0); } else { /* not the last component */ link = walk_component(nd, WALK_MORE); } if (unlikely(link)) { if (IS_ERR(link)) return PTR_ERR(link); /* a symlink to follow */ nd->stack[depth++].name = name; name = link; continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; } } } /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { int error; const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) return ERR_PTR(-EAGAIN); if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); else nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; } else { path_get(&nd->path); } return s; } nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); return s; } /* Relative pathname -- get the starting-point it is relative to. */ if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; if (fd_empty(f)) return ERR_PTR(-EBADF); if (flags & LOOKUP_LINKAT_EMPTY) { if (fd_file(f)->f_cred != current_cred() && !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) return ERR_PTR(-ENOENT); } dentry = fd_file(f)->f_path.dentry; if (*s && unlikely(!d_can_lookup(dentry))) return ERR_PTR(-ENOTDIR); nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } } /* For scoped-lookups we need to set the root to the dirfd as well. */ if (flags & LOOKUP_IS_SCOPED) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; } else { path_get(&nd->root); nd->state |= ND_ROOT_GRABBED; } } return s; } static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); nd->next_seq = nd->seq; return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); if (unlikely(err < 0)) s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); return retval; } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { *parent = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } /* Note: this does not consume "name" */ static int __filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } restore_nameidata(); return retval; } static int filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { struct dentry *d; struct qstr last; int type, error; error = filename_parentat(dfd, name, 0, path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) { path_put(path); return ERR_PTR(-EINVAL); } inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, path->dentry, 0); if (IS_ERR(d)) { inode_unlock(path->dentry->d_inode); path_put(path); } return d; } struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); putname(filename); return res; } struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) { struct filename *filename = getname(name); struct dentry *res = __kern_path_locked(dfd, filename, path); putname(filename); return res; } EXPORT_SYMBOL(user_path_locked_at); int kern_path(const char *name, unsigned int flags, struct path *path) { struct filename *filename = getname_kernel(name); int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(kern_path); /** * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair * @filename: filename structure * @flags: lookup flags * @parent: pointer to struct path to fill * @last: last component * @type: type of the last component * @root: pointer to struct path of the base directory */ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { return __filename_parentat(AT_FDCWD, filename, flags, parent, last, type, root); } EXPORT_SYMBOL(vfs_path_parent_lookup); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { struct filename *filename; struct path root = {.mnt = mnt, .dentry = dentry}; int ret; filename = getname_kernel(name); /* the first argument of filename_lookup() is ignored with root */ ret = filename_lookup(AT_FDCWD, filename, flags, path, &root); putname(filename); return ret; } EXPORT_SYMBOL(vfs_path_lookup); static int lookup_one_common(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len, struct qstr *this) { this->name = name; this->len = len; this->hash = full_name_hash(base, name, len); if (!len) return -EACCES; if (is_dot_dotdot(name, len)) return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, this); if (err < 0) return err; } return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** * try_lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); return lookup_dcache(&this, base, 0); } EXPORT_SYMBOL(try_lookup_one_len); /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one_len); /** * lookup_one - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The caller must hold base->i_mutex. */ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *dentry; struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); dentry = lookup_dcache(&this, base, 0); return dentry ? dentry : __lookup_slow(&this, base, 0); } EXPORT_SYMBOL(lookup_one); /** * lookup_one_unlocked - filesystem helper to lookup single pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct qstr this; int err; struct dentry *ret; err = lookup_one_common(idmap, name, base, len, &this); if (err) return ERR_PTR(err); ret = lookup_dcache(&this, base, 0); if (!ret) ret = lookup_slow(&this, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_unlocked); /** * lookup_one_positive_unlocked - filesystem helper to lookup single * pathname component * @idmap: idmap of the mount the lookup is performed from * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. * * Note that pinned negative with unlocked parent _can_ become positive at any * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have >d_inode stable, so this one avoids such problems. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * The helper should be called without i_mutex held. */ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, const char *name, struct dentry *base, int len) { struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } EXPORT_SYMBOL(lookup_one_positive_unlocked); /** * lookup_one_len_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * * Unlike lookup_one_len, it should be called without the parent * i_mutex held, and will take the i_mutex itself if necessary. */ struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_one_len_unlocked); /* * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent * _can_ become positive at any time, so callers of lookup_one_len_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. */ struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); } EXPORT_SYMBOL(lookup_positive_unlocked); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) { /* Find something mounted on "pts" in the same directory as * the input path. */ struct dentry *parent = dget_parent(path->dentry); struct dentry *child; struct qstr this = QSTR_INIT("pts", 3); if (unlikely(!path_connected(path->mnt, parent))) { dput(parent); return -ENOENT; } dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; dput(parent); follow_down(path, 0); return 0; } #endif int user_path_at(int dfd, const char __user *name, unsigned flags, struct path *path) { struct filename *filename = getname_flags(name, flags); int ret = filename_lookup(dfd, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(user_path_at); int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid)) return 0; if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid)) return 0; return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. * 1. We can't do it if dir is read-only (done in permission()) * 2. We should have write and exec permissions on dir * 3. We can't remove anything from append-only dir * 4. We can't do anything with immutable dir (done in permission()) * 5. If the sticky bit on dir is set we should either * a. be owner of dir, or * b. be owner of victim, or * c. have CAP_FOWNER capability * 6. If the victim is append-only or immutable we can't do antyhing with * links pointing to it. * 7. If the victim has an unknown uid or gid we can't change the inode. * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR. * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR. * 10. We can't remove a root or mountpoint. * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ static int may_delete(struct mnt_idmap *idmap, struct inode *dir, struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) return -ENOENT; BUG_ON(!inode); BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) return -ENOTDIR; if (IS_ROOT(victim)) return -EBUSY; } else if (d_is_dir(victim)) return -EISDIR; if (IS_DEADDIR(dir)) return -ENOENT; if (victim->d_flags & DCACHE_NFSFS_RENAMED) return -EBUSY; return 0; } /* Check whether we can create an object with dentry child in directory * dir. * 1. We can't do it if child already exists (open has special treatment for * this case, but since we are inlined it's OK) * 2. We can't do it if dir is read-only (done in permission()) * 3. We can't do it if the fs can't represent the fsuid or fsgid. * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ static inline int may_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *child) { audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); if (child->d_inode) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; if (!fsuidgid_has_mapping(dir->i_sb, idmap)) return -EOVERFLOW; return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); } // p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2) { struct dentry *p = p1, *q = p2, *r; while ((r = p->d_parent) != p2 && r != p) p = r; if (r == p2) { // p is a child of p2 and an ancestor of p1 or p1 itself inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2); return p; } // p is the root of connected component that contains p1 // p2 does not occur on the path from p to p1 while ((r = q->d_parent) != p1 && r != p && r != q) q = r; if (r == p1) { // q is a child of p1 and an ancestor of p2 or p2 itself inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return q; } else if (likely(r == p)) { // both p2 and p1 are descendents of p inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } else { // no common ancestor at the time we'd been called mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); return ERR_PTR(-EXDEV); } } /* * p1 and p2 should be directories on the same fs. */ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) { if (p1 == p2) { inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } mutex_lock(&p1->d_sb->s_vfs_rename_mutex); return lock_two_directories(p1, p2); } EXPORT_SYMBOL(lock_rename); /* * c1 and p2 should be on the same fs. */ struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2) { if (READ_ONCE(c1->d_parent) == p2) { /* * hopefully won't need to touch ->s_vfs_rename_mutex at all. */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); /* * now that p2 is locked, nobody can move in or out of it, * so the test below is safe. */ if (likely(c1->d_parent == p2)) return NULL; /* * c1 got moved out of p2 while we'd been taking locks; * unlock and fall back to slow case. */ inode_unlock(p2->d_inode); } mutex_lock(&c1->d_sb->s_vfs_rename_mutex); /* * nobody can move out of any directories on this fs. */ if (likely(c1->d_parent != p2)) return lock_two_directories(c1->d_parent, p2); /* * c1 got moved into p2 while we were taking locks; * we need p2 locked and ->s_vfs_rename_mutex unlocked, * for consistency with lock_rename(). */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } EXPORT_SYMBOL(lock_rename_child); void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } EXPORT_SYMBOL(unlock_rename); /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: allowed permission by the vfs * @type: type of file to be created * * This helper consolidates and enforces vfs restrictions on the @mode of a new * object to be created. * * Umask stripping depends on whether the filesystem supports POSIX ACLs (see * the kernel documentation for mode_strip_umask()). Moving umask stripping * after setgid stripping allows the same ordering for both non-POSIX ACL and * POSIX ACL supporting filesystems. * * Note that it's currently valid for @type to be 0 if a directory is created. * Filesystems raise that flag individually and we need to check whether each * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a * non-zero type. * * Returns: mode to be passed to the filesystem */ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* * Apply the vfs mandated allowed permission mask and set the type of * file to be created before we call into the filesystem. */ mode &= (mask_perms & ~S_IFMT); mode |= (type & S_IFMT); return mode; } /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child file * @mode: mode of the child file * @want_excl: whether the file must not yet exist * * Create a new file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_create); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), void *arg) { struct inode *dir = dentry->d_parent->d_inode; int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkobj); bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; if (!inode) return -ENOENT; switch (inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; if (acc_mode & MAY_EXEC) return -EACCES; break; case S_IFBLK: case S_IFCHR: if (!may_open_dev(path)) return -EACCES; fallthrough; case S_IFIFO: case S_IFSOCK: if (acc_mode & MAY_EXEC) return -EACCES; flag &= ~O_TRUNC; break; case S_IFREG: if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; error = security_file_truncate(filp); if (!error) { error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } put_write_access(inode); return error; } static inline int open_to_namei_flags(int flag) { if ((flag & O_ACCMODE) == 3) flag--; return flag; } static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; return security_inode_create(dir->dentry->d_inode, dentry, mode); } /* * Attempt to atomically look up, create and open a file from a negative * dentry. * * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * * If the file was looked up only or didn't need creating, FMODE_OPENED won't * be set. The caller will need to perform the open themselves. @path will * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. */ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, struct file *file, int open_flag, umode_t mode) { struct dentry *const DENTRY_NOT_SET = (void *) -1UL; struct inode *dir = nd->path.dentry->d_inode; int error; if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { if (unlikely(dentry != file->f_path.dentry)) { dput(dentry); dentry = dget(file->f_path.dentry); } } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) { error = -EIO; } else { if (file->f_path.dentry) { dput(dentry); dentry = file->f_path.dentry; } if (unlikely(d_is_negative(dentry))) error = -ENOENT; } } if (error) { dput(dentry); dentry = ERR_PTR(error); } return dentry; } /* * Look up and maybe create and open the last component. * * Must be called with parent locked (exclusive in O_CREAT case). * * Returns 0 on success, that is, if * the file was successfully atomically created (if necessary) and opened, or * the file was not completely opened at this time, though lookups and * creations were performed. * These case are distinguished by presence of FMODE_OPENED on file->f_mode. * In the latter case dentry returned in @path might be negative if O_CREAT * hadn't been specified. * * An error code is returned on failure. */ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { struct mnt_idmap *idmap; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); file->f_mode &= ~FMODE_CREATED; dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { dentry = d_alloc_parallel(dir, &nd->last, &wq); if (IS_ERR(dentry)) return dentry; } if (d_in_lookup(dentry)) break; error = d_revalidate(dentry, nd->flags); if (likely(error > 0)) break; if (error) goto out_dput; d_invalidate(dentry); dput(dentry); dentry = NULL; } if (dentry->d_inode) { /* Cached positive dentry: will open in f_op->open */ return dentry; } if (open_flag & O_CREAT) audit_inode(nd->name, dir, AUDIT_INODE_PARENT); /* * Checking write permission is tricky, bacuse we don't know if we are * going to actually need it: O_CREAT opens should work as long as the * file exists. But checking existence breaks atomicity. The trick is * to check access and if not granted clear O_CREAT from the flags. * * Another problem is returing the "right" error value (e.g. for an * O_EXCL open we want to return EEXIST not EROFS). */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; } if (create_error) open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); return dentry; } if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } return dentry; out_dput: dput(dentry); return ERR_PTR(error); } static inline bool trailing_slashes(struct nameidata *nd) { return (bool)nd->last.name[nd->last.len]; } static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) { struct dentry *dentry; if (open_flag & O_CREAT) { if (trailing_slashes(nd)) return ERR_PTR(-EISDIR); /* Don't bother on an O_EXCL create */ if (open_flag & O_EXCL) return NULL; } if (trailing_slashes(nd)) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; dentry = lookup_fast(nd); if (IS_ERR_OR_NULL(dentry)) return dentry; if (open_flag & O_CREAT) { /* Discard negative dentries. Need inode_lock to do the create */ if (!dentry->d_inode) { if (!(nd->flags & LOOKUP_RCU)) dput(dentry); dentry = NULL; } } return dentry; } static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; struct dentry *dentry; const char *res; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } /* We _can_ be in RCU mode here */ dentry = lookup_fast_for_open(nd, open_flag); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; if (!(open_flag & O_CREAT)) { if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. */ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); if (!IS_ERR(dentry)) { if (file->f_mode & FMODE_CREATED) fsnotify_create(dir->d_inode, dentry); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (got_write) mnt_drop_write(nd->path.mnt); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; return NULL; } finish_lookup: if (nd->depth) put_link(nd); res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; } /* * Handle the last step of open() */ static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; int error; if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { error = complete_walk(nd); if (error) return error; } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; acc_mode = 0; } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) return error; do_truncate = true; } error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = security_file_post_open(file, op->acc_mode); if (!error && do_truncate) error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from * @parentpath: pointer to the path of the base directory * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile * * Create a temporary file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { struct dentry *child; struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) return -EOPNOTSUPP; child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); if (error) return error; /* Don't check for other permissions, the inode was just created */ error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } security_inode_post_create_tmpfile(idmap, inode); return 0; } /** * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags * @cred: credentials for open * * Create and open a temporary file. The file is not accounted in nr_files, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); if (IS_ERR(file)) return file; error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); } return file; } EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: path_put(&path); return error; } static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { struct path path; int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file); path_put(&path); } return error; } static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { struct file *file; int error; file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file); } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && (s = open_last_lookups(nd, file, op)) != NULL) ; if (!error) error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED)) return file; WARN_ON(1); error = -EINVAL; } fput(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } return ERR_PTR(error); } struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; set_nameidata(&nd, dfd, pathname, NULL); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd,